[RFC 10/14] drm/xe/pxp: add a query for PXP status

Mon Jul 15 20:19:15 UTC 2024

On 7/15/2024 1:16 PM, Souza, Jose wrote:
> On Mon, 2024-07-15 at 13:06 -0700, Daniele Ceraolo Spurio wrote:
>> On 7/15/2024 12:52 PM, Souza, Jose wrote:
>>> On Mon, 2024-07-15 at 12:38 -0700, Daniele Ceraolo Spurio wrote:
>>>> On 7/15/2024 11:41 AM, Souza, Jose wrote:
>>>>> On Mon, 2024-07-15 at 11:03 -0700, Daniele Ceraolo Spurio wrote:
>>>>>> On 7/15/2024 10:54 AM, Souza, Jose wrote:
>>>>>>> On Fri, 2024-07-12 at 14:28 -0700, Daniele Ceraolo Spurio wrote:
>>>>>>>> PXP prerequisites (SW proxy and HuC auth via GSC) are completed
>>>>>>>> asynchronously from driver load, which means that userspace can start
>>>>>>>> submitting before we're ready to start a PXP session. Therefore, we need
>>>>>>>> a query that userspace can use to check not only if PXP is supported by
>>>>>>>> also to wait until the prerequisites are done.
>>>>>>>>
>>>>>>>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
>>>>>>>> ---
>>>>>>>>      drivers/gpu/drm/xe/xe_pxp.c   | 33 +++++++++++++++++++++++++++++++++
>>>>>>>>      drivers/gpu/drm/xe/xe_pxp.h   |  1 +
>>>>>>>>      drivers/gpu/drm/xe/xe_query.c | 33 +++++++++++++++++++++++++++++++++
>>>>>>>>      include/uapi/drm/xe_drm.h     | 28 ++++++++++++++++++++++++++++
>>>>>>>>      4 files changed, 95 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/xe/xe_pxp.c b/drivers/gpu/drm/xe/xe_pxp.c
>>>>>>>> index e979b47f155a..e39a47aeb050 100644
>>>>>>>> --- a/drivers/gpu/drm/xe/xe_pxp.c
>>>>>>>> +++ b/drivers/gpu/drm/xe/xe_pxp.c
>>>>>>>> @@ -60,6 +60,39 @@ static bool pxp_prerequisites_done(const struct xe_pxp *pxp)
>>>>>>>>      	return ready;
>>>>>>>>      }
>>>>>>>>      
>>>>>>>> +/**
>>>>>>>> + * xe_pxp_get_readiness_status - check whether PXP is ready for userspace use
>>>>>>>> + * @pxp: the xe_pxp pointer (can be NULL if PXP is disabled)
>>>>>>>> + *
>>>>>>>> + * This function is used for status query from userspace, so the returned value
>>>>>>>> + * follow the uapi (see drm_xe_query_pxp_status)
>>>>>>>> + *
>>>>>>>> + * Returns: 0 if PXP is not ready yet, 1 if it is ready, an errno value if PXP
>>>>>>>> + * is not supported/enabled or if something went wrong in the initialization of
>>>>>>>> + * the prerequisites.
>>>>>>>> + */
>>>>>>>> +int xe_pxp_get_readiness_status(struct xe_pxp *pxp)
>>>>>>>> +{
>>>>>>>> +	int ret = 0;
>>>>>>>> +
>>>>>>>> +	if (!xe_pxp_is_enabled(pxp))
>>>>>>>> +		return -ENODEV;
>>>>>>>> +
>>>>>>>> +	/* if the GSC or HuC FW are in an error state, PXP will never work */
>>>>>>>> +	if (xe_uc_fw_status_to_error(pxp->gt->uc.huc.fw.status) ||
>>>>>>>> +	    xe_uc_fw_status_to_error(pxp->gt->uc.gsc.fw.status))
>>>>>>>> +		return -EIO;
>>>>>>>> +
>>>>>>>> +	xe_pm_runtime_get(pxp->xe);
>>>>>>>> +
>>>>>>>> +	/* PXP requires both HuC loaded and GSC proxy initialized */
>>>>>>>> +	if (pxp_prerequisites_done(pxp))
>>>>>>>> +		ret = 1;
>>>>>>>> +
>>>>>>>> +	xe_pm_runtime_put(pxp->xe);
>>>>>>>> +	return ret;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>      static bool pxp_session_is_in_play(struct xe_pxp *pxp, u32 id)
>>>>>>>>      {
>>>>>>>>      	struct xe_gt *gt = pxp->gt;
>>>>>>>> diff --git a/drivers/gpu/drm/xe/xe_pxp.h b/drivers/gpu/drm/xe/xe_pxp.h
>>>>>>>> index 8f0a3a514fb8..b2aae4abdd0e 100644
>>>>>>>> --- a/drivers/gpu/drm/xe/xe_pxp.h
>>>>>>>> +++ b/drivers/gpu/drm/xe/xe_pxp.h
>>>>>>>> @@ -14,6 +14,7 @@ struct xe_pxp;
>>>>>>>>      
>>>>>>>>      bool xe_pxp_is_supported(const struct xe_device *xe);
>>>>>>>>      bool xe_pxp_is_enabled(const struct xe_pxp *pxp);
>>>>>>>> +int xe_pxp_get_readiness_status(struct xe_pxp *pxp);
>>>>>>>>      
>>>>>>>>      int xe_pxp_init(struct xe_device *xe);
>>>>>>>>      void xe_pxp_irq_handler(struct xe_device *xe, u16 iir);
>>>>>>>> diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c
>>>>>>>> index 4e01df6b1b7a..5da9f403c2b9 100644
>>>>>>>> --- a/drivers/gpu/drm/xe/xe_query.c
>>>>>>>> +++ b/drivers/gpu/drm/xe/xe_query.c
>>>>>>>> @@ -22,6 +22,7 @@
>>>>>>>>      #include "xe_guc_hwconfig.h"
>>>>>>>>      #include "xe_macros.h"
>>>>>>>>      #include "xe_mmio.h"
>>>>>>>> +#include "xe_pxp.h"
>>>>>>>>      #include "xe_ttm_vram_mgr.h"
>>>>>>>>      
>>>>>>>>      static const u16 xe_to_user_engine_class[] = {
>>>>>>>> @@ -678,6 +679,37 @@ static int query_oa_units(struct xe_device *xe,
>>>>>>>>      	return ret ? -EFAULT : 0;
>>>>>>>>      }
>>>>>>>>      
>>>>>>>> +static int query_pxp_status(struct xe_device *xe, struct drm_xe_device_query *query)
>>>>>>>> +{
>>>>>>>> +	struct drm_xe_query_pxp_status __user *query_ptr = u64_to_user_ptr(query->data);
>>>>>>>> +	size_t size = sizeof(struct drm_xe_query_pxp_status);
>>>>>>>> +	struct drm_xe_query_pxp_status resp;
>>>>>>>> +	int ret;
>>>>>>>> +
>>>>>>>> +	if (query->size == 0) {
>>>>>>>> +		query->size = size;
>>>>>>>> +		return 0;
>>>>>>>> +	} else if (XE_IOCTL_DBG(xe, query->size != size)) {
>>>>>>>> +		return -EINVAL;
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	if (copy_from_user(&resp, query_ptr, size))
>>>>>>>> +		return -EFAULT;
>>>>>>>> +
>>>>>>>> +	ret = xe_pxp_get_readiness_status(xe->pxp);
>>>>>>>> +	if (ret < 0)
>>>>>>>> +		return ret;
>>>>>>>> +
>>>>>>>> +	resp.status = ret;
>>>>>>> when status is == 0? does it means that will always succeed? it just matter of time.
>>>>>> no, it just means the init is still in progress. It can still end up in
>>>>>> an error state if something goes wrong during the init (e.g. GSC proxy
>>>>>> init failure), although that's unlikely.
>>>>>>
>>>>>>> For i915 in some older kernel versions we had to create a gem_context with I915_CONTEXT_PARAM_PROTECTED_CONTENT set to verify for sure if supported or
>>>>>>> not.
>>>>>> This query is enough to confirm. That's true for i915 as well, the query
>>>>>> was added explicitly to avoid having to create a context just to check
>>>>>> for PXP.
>>>>> But as you said above, if status == 0 it could still fail, so UMD will still need create a exec_queue to verify because UMD needs to report to
>>>>> applications if protected stuff is supported or not as one of first steps of Vulkan API.
>>>> If you need the answer and the status is 0, you need to wait a bit an
>>>> then check again. There is no other safe way of knowing if PXP fully is
>>>> supported or not.
>>>> Note that we expect this to complete relatively quickly after driver
>>>> load; the init itself takes ~500ms, but since it is done in a worker
>>>> thread it can be delayed a bit if the system is busy. The only exception
>>>> is if the mei driver doesn't load, for which case the timeout is 10s,
>>>> but that should in theory never happen unless the HW is in a very bad
>>>> state or the mei driver is totally busted.
>>>>
>>>>>    From what I looked to the code, exec_queue will return EBUSY if UMD tries to create a exec_queue with DRM_XE_PXP_TYPE_HWDRM but
>>>>> pxp_prerequisites_done() is still returning false. Can't you make something block in KMD until PXP initialization is running? Maybe add a blocking
>>>>> parameter in the status uAPI?
>>>> I can't block the driver init given that GSC init can be slow and we
>>>> don't want to delay the whole driver load by 500ms or more.
>>> I mean not block driver init but block exec_queue_create or DRM_XE_DEVICE_QUERY_PXP_STATUS.
>>>
>>>> I could add a parameter to block the pxp_status query until the result
>>>> is certain, but is there any reason you can't do such a wait in
>>>> userspace? I think it's better to block in userspace when possible and
>>>> it should be relatively simple to implement, something like:
>>>>
>>>> while (time_ms < timeout_ms) {
>>>>            err = ioctl (<query pxp status>);
>>>>            if (err || pxp_status == 1)
>>>>                    break;
>>>>
>>>>            msleep(50);
>>>>            time_ms += 50;
>>>> }
>>> The problem of doing this in UMD is that we would need to wait up to 10s in all cases.
>>> KMD could do some checking like if mei is loaded then wait up to 500ms, if not ready after 500ms mark as not supported.
>> We actually can't do any extra checking inside KMD. I haven't found a
>> way of figuring out if mei is working; the mei driver has a number of
>> sub-components, one of which is the one we need, and I couldn't find any
>> way to figure out if it's not up because we're still waiting for it to
>> load or if something went wrong. The only thing we can do is timeout on
>> it (more on this below).
>>
>>> And from what I understood there is no timeout in pxp_prerequisites_done() so even after 10s it would still return 1, so the next application starting
>>> would need to wait again up to 10s.
>> There is a timeout on the GSC FW init and HuC auth, so the FW status
>> will move to error if the init doesn't complete or something goes wrong.
>> We check the FW status at the beginning of
>> xe_pxp_get_readiness_status(), so  all calls from that point onward
>> would return an error code. The status can only be 0 in the first 10s
>> from driver load (which is the worst case and should never happen, in
>> normal scenario it should resolve within 1s even if there is a problem),
>> after that it'll either move to 1 or return an error.
> Okay will do the wait on UMD but can you add some comments to drm_xe_query_pxp_status with the time expected for PXP to finish initialization?
> Mesa reviewers will ask from where did I got the timeout value.

Sure. I'll also try make it clearer that the status will move one way or 
the other within that timeout, so follow-up queries won't have to wait.

Daniele

>
>>>> I have something similar in the local IGT that I'm using for testing.
>>>>
>>>> Also note that the proposed behavior is basically the same as i915.
>>>> Apart from the defines, the only difference is that in i915 we wait for
>>>> a bit (250ms) on context creation, but if PXP is not ready within that
>>>> time we do return a try again later error. Relevant i915 snippet:
>>>>
>>>> ret = intel_pxp_get_readiness_status(pxp, PXP_READINESS_TIMEOUT);
>>>> if (ret < 0) {
>>>>            drm_dbg(&pxp->ctrl_gt->i915->drm, "PXP: tried but not-avail
>>>> (%d)", ret);
>>>>            return ret;
>>>> } else if (ret > 1) {
>>>>            return -EIO; /* per UAPI spec, user may retry later */
>>>> }
>>>>
>>>>>>>> +	resp.supported_session_types =
>>>>>>>> +		BIT(DRM_XE_PXP_TYPE_NONE) | BIT(DRM_XE_PXP_TYPE_HWDRM);
>>>>>>> Looks odd to me that you return DRM_XE_PXP_TYPE_NONE as supported...
>>>>>> I just added it in to have all values of the enum represented. I can
>>>>>> take it out no problem if you think it is clearer that way, I'll just
>>>>>> add a note in the interface doc that DRM_XE_PXP_TYPE_NONE is always
>>>>>> supported.
>>>>> Ah just notice that enum drm_xe_pxp_session_type is set in exec_queue. But still, if PXP is not supported/available this will fail, so I can't see a
>>>>> case were only DRM_XE_PXP_TYPE_NONE is returned.
>>>>> So I don't think you need to return it as supported.
>>>> ok, I'll update it.
>>>>
>>>> Daniele
>>>>
>>>>>> Daniele
>>>>>>
>>>>>>>> +
>>>>>>>> +	if (copy_to_user(query_ptr, &resp, size))
>>>>>>>> +		 return -EFAULT;
>>>>>>>> +
>>>>>>>> +	return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>      static int (* const xe_query_funcs[])(struct xe_device *xe,
>>>>>>>>      				      struct drm_xe_device_query *query) = {
>>>>>>>>      	query_engines,
>>>>>>>> @@ -689,6 +721,7 @@ static int (* const xe_query_funcs[])(struct xe_device *xe,
>>>>>>>>      	query_engine_cycles,
>>>>>>>>      	query_uc_fw_version,
>>>>>>>>      	query_oa_units,
>>>>>>>> +	query_pxp_status,
>>>>>>>>      };
>>>>>>>>      
>>>>>>>>      int xe_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>>>>>>> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
>>>>>>>> index 631fdc2ed493..746bc16bd220 100644
>>>>>>>> --- a/include/uapi/drm/xe_drm.h
>>>>>>>> +++ b/include/uapi/drm/xe_drm.h
>>>>>>>> @@ -619,6 +619,32 @@ struct drm_xe_query_uc_fw_version {
>>>>>>>>      	__u64 reserved;
>>>>>>>>      };
>>>>>>>>      
>>>>>>>> +/**
>>>>>>>> + * struct drm_xe_query_pxp_status - query if PXP is ready
>>>>>>>> + *
>>>>>>>> + * If PXP is enabled and no fatal error as occurred, the status will be set to
>>>>>>>> + * one of the following values:
>>>>>>>> + * 0: PXP init still in progress
>>>>>>>> + * 1: PXP init complete
>>>>>>>> + *
>>>>>>>> + * The supported session type bitmask is based on the values in
>>>>>>>> + * enum drm_xe_pxp_session_type, including TYPE_NONE.
>>>>>>>> + *
>>>>>>>> + * If PXP is not enabled or something has gone wrong, the query will be failed
>>>>>>>> + * with one of the following error codes:
>>>>>>>> + * -ENODEV: PXP not supported or disabled;
>>>>>>>> + * -EIO: fatal error occurred during init, so PXP will never be enabled;
>>>>>>>> + * -EINVAL: incorrect value provided as part of the query;
>>>>>>>> + * -EFAULT: error copying the memory between kernel and userspace.
>>>>>>>> + */
>>>>>>>> +struct drm_xe_query_pxp_status {
>>>>>>>> +	/** @status: current PXP status */
>>>>>>>> +	__u32 status;
>>>>>>>> +
>>>>>>>> +	/** @supported_session_types: bitmask of supported PXP session types */
>>>>>>>> +	__u32 supported_session_types;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>>      /**
>>>>>>>>       * struct drm_xe_device_query - Input of &DRM_IOCTL_XE_DEVICE_QUERY - main
>>>>>>>>       * structure to query device information
>>>>>>>> @@ -638,6 +664,7 @@ struct drm_xe_query_uc_fw_version {
>>>>>>>>       *    attributes.
>>>>>>>>       *  - %DRM_XE_DEVICE_QUERY_GT_TOPOLOGY
>>>>>>>>       *  - %DRM_XE_DEVICE_QUERY_ENGINE_CYCLES
>>>>>>>> + *  - %DRM_XE_DEVICE_QUERY_PXP_STATUS
>>>>>>>>       *
>>>>>>>>       * If size is set to 0, the driver fills it with the required size for
>>>>>>>>       * the requested type of data to query. If size is equal to the required
>>>>>>>> @@ -690,6 +717,7 @@ struct drm_xe_device_query {
>>>>>>>>      #define DRM_XE_DEVICE_QUERY_ENGINE_CYCLES	6
>>>>>>>>      #define DRM_XE_DEVICE_QUERY_UC_FW_VERSION	7
>>>>>>>>      #define DRM_XE_DEVICE_QUERY_OA_UNITS		8
>>>>>>>> +#define DRM_XE_DEVICE_QUERY_PXP_STATUS		9
>>>>>>>>      	/** @query: The type of data to query */
>>>>>>>>      	__u32 query;
>>>>>>>>