[PATCH 2/3] drm/xe/pf: Make sure PF is ready to configure VFs

Michal Wajdeczko michal.wajdeczko at intel.com
Wed Jul 30 21:44:34 UTC 2025



On 7/30/2025 11:09 PM, Cavitt, Jonathan wrote:
> -----Original Message-----
> From: Intel-xe <intel-xe-bounces at lists.freedesktop.org> On Behalf Of Michal Wajdeczko
> Sent: Wednesday, July 30, 2025 10:49 AM
> To: intel-xe at lists.freedesktop.org
> Cc: Wajdeczko, Michal <Michal.Wajdeczko at intel.com>
> Subject: [PATCH 2/3] drm/xe/pf: Make sure PF is ready to configure VFs
>>
>> The PF driver might be resumed just to configure VFs, but since
>> it is doing some asynchronous GuC reconfigurations after fresh
>> reset, we should wait until all pending works are completed.
>>
>> This is especially important in case of LMEM provisioning, since
>> we also need to update the LMTT and send invalidation requests
>> to all GuCs, which are expected to be already in the VGT mode.
>>
>> Fixes: 68ae022278a1 ("drm/xe/pf: Force GuC virtualization mode")
>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko at intel.com>
>> ---
>>  drivers/gpu/drm/xe/xe_gt_sriov_pf.c         | 28 +++++++++++++++++++++
>>  drivers/gpu/drm/xe/xe_gt_sriov_pf.h         |  1 +
>>  drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c |  4 ++-
>>  drivers/gpu/drm/xe/xe_pci_sriov.c           |  7 +++++-
>>  drivers/gpu/drm/xe/xe_sriov_pf.c            | 27 ++++++++++++++++++++
>>  drivers/gpu/drm/xe/xe_sriov_pf.h            |  1 +
>>  6 files changed, 66 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c
>> index 2761319fdc26..8bc7d7f9f47a 100644
>> --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c
>> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c
>> @@ -16,6 +16,7 @@
>>  #include "xe_gt_sriov_pf_migration.h"
>>  #include "xe_gt_sriov_pf_service.h"
>>  #include "xe_gt_sriov_printk.h"
>> +#include "xe_guc_submit.h"
>>  #include "xe_mmio.h"
>>  #include "xe_pm.h"
>>  
>> @@ -258,3 +259,30 @@ void xe_gt_sriov_pf_restart(struct xe_gt *gt)
>>  {
>>  	pf_queue_restart(gt);
>>  }
>> +
>> +static bool pf_flush_restart(struct xe_gt *gt)
>> +{
>> +	xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt))); 
> 
> Asserting here seems unnecessary.  If we're certain that the pf functions
> will never be called in a non-pf context, then we probably don't need to
> assert the system is in sriov-pf mode here.  And if we aren't certain of this,
> it probably makes more sense to put the assertion in
> xe_gt_sriov_pf_wait_ready instead, or to have a separate return path instead
> of breaking instantly.

the purpose of using our xe_assert() is exactly to be certain that we never call
functions in an unexpected way, without having to analyze crash logs when someone
actually make a mistake 

and while I'm certain that today I'm not calling that wrong, I don't know what
others will do next day

and here in this function we are accessing a PF-only member of the union, so
like in the rest of the SRIOV code, we have explicit assert to tag function

and all xe_asserts() are compiled away on production builds, so no worry

> 
> Unless the system can exit sriov-pf mode in the middle of execution?  I don't
> think that's a possibility, though.
> 
> The assertion by itself isn't harmful if we're assuming the system would never
> hit the assertion in a non-sriov-pf context, though, so I won't block on it's
> removal.  Though a justification would do well here, if only for my sake.

the goal of asserts is that we don't expect they will be hit

> 
> Everything else is
> Reviewed-by: Jonathan Cavitt <jonathan.cavitt at intel.com>
> -Jonathan Cavitt
> 
>> +
>> +	return flush_work(&gt->sriov.pf.workers.restart);
>> +}
>> +
>> +/**
>> + * xe_gt_sriov_pf_wait_ready() - Wait until per-GT PF SR-IOV support is ready.
>> + * @gt: the &xe_gt
>> + *
>> + * This function can only be called on PF.
>> + *
>> + * Return: 0 on success or a negative error code on failure.
>> + */
>> +int xe_gt_sriov_pf_wait_ready(struct xe_gt *gt)
>> +{
>> +	/* don't wait if there is another ongoing reset */
>> +	if (xe_guc_read_stopped(&gt->uc.guc))
>> +		return -EBUSY;
>> +
>> +	if (pf_flush_restart(gt))
>> +		xe_gt_sriov_dbg_verbose(gt, "ready after restart\n");
>> +
>> +	return 0;
>> +}
>> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf.h
>> index e2b2ff8132dc..e7fde3f9937a 100644
>> --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf.h
>> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf.h
>> @@ -11,6 +11,7 @@ struct xe_gt;
>>  #ifdef CONFIG_PCI_IOV
>>  int xe_gt_sriov_pf_init_early(struct xe_gt *gt);
>>  int xe_gt_sriov_pf_init(struct xe_gt *gt);
>> +int xe_gt_sriov_pf_wait_ready(struct xe_gt *gt);
>>  void xe_gt_sriov_pf_init_hw(struct xe_gt *gt);
>>  void xe_gt_sriov_pf_sanitize_hw(struct xe_gt *gt, unsigned int vfid);
>>  void xe_gt_sriov_pf_stop_prepare(struct xe_gt *gt);
>> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c
>> index bf679b21f485..3ed245e04d0c 100644
>> --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c
>> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c
>> @@ -22,6 +22,7 @@
>>  #include "xe_gt_sriov_pf_policy.h"
>>  #include "xe_gt_sriov_pf_service.h"
>>  #include "xe_pm.h"
>> +#include "xe_sriov_pf.h"
>>  
>>  /*
>>   *      /sys/kernel/debug/dri/0/
>> @@ -205,7 +206,8 @@ static int CONFIG##_set(void *data, u64 val)					\
>>  		return -EOVERFLOW;						\
>>  										\
>>  	xe_pm_runtime_get(xe);							\
>> -	err = xe_gt_sriov_pf_config_set_##CONFIG(gt, vfid, val);		\
>> +	err = xe_sriov_pf_wait_ready(xe) ?:					\
>> +	      xe_gt_sriov_pf_config_set_##CONFIG(gt, vfid, val);		\
>>  	xe_pm_runtime_put(xe);							\
>>  										\
>>  	return err;								\
>> diff --git a/drivers/gpu/drm/xe/xe_pci_sriov.c b/drivers/gpu/drm/xe/xe_pci_sriov.c
>> index 27b148eab49c..af05db07162e 100644
>> --- a/drivers/gpu/drm/xe/xe_pci_sriov.c
>> +++ b/drivers/gpu/drm/xe/xe_pci_sriov.c
>> @@ -16,6 +16,7 @@
>>  #include "xe_pci_sriov.h"
>>  #include "xe_pm.h"
>>  #include "xe_sriov.h"
>> +#include "xe_sriov_pf.h"
>>  #include "xe_sriov_pf_helpers.h"
>>  #include "xe_sriov_printk.h"
>>  
>> @@ -154,6 +155,10 @@ static int pf_enable_vfs(struct xe_device *xe, int num_vfs)
>>  	xe_assert(xe, num_vfs <= total_vfs);
>>  	xe_sriov_dbg(xe, "enabling %u VF%s\n", num_vfs, str_plural(num_vfs));
>>  
>> +	err = xe_sriov_pf_wait_ready(xe);
>> +	if (err)
>> +		goto out;
>> +
>>  	/*
>>  	 * We must hold additional reference to the runtime PM to keep PF in D0
>>  	 * during VFs lifetime, as our VFs do not implement the PM capability.
>> @@ -191,7 +196,7 @@ static int pf_enable_vfs(struct xe_device *xe, int num_vfs)
>>  failed:
>>  	pf_unprovision_vfs(xe, num_vfs);
>>  	xe_pm_runtime_put(xe);
>> -
>> +out:
>>  	xe_sriov_notice(xe, "Failed to enable %u VF%s (%pe)\n",
>>  			num_vfs, str_plural(num_vfs), ERR_PTR(err));
>>  	return err;
>> diff --git a/drivers/gpu/drm/xe/xe_sriov_pf.c b/drivers/gpu/drm/xe/xe_sriov_pf.c
>> index afbdd894bd6e..7e8f3d3ac656 100644
>> --- a/drivers/gpu/drm/xe/xe_sriov_pf.c
>> +++ b/drivers/gpu/drm/xe/xe_sriov_pf.c
>> @@ -9,6 +9,7 @@
>>  
>>  #include "xe_assert.h"
>>  #include "xe_device.h"
>> +#include "xe_gt_sriov_pf.h"
>>  #include "xe_module.h"
>>  #include "xe_sriov.h"
>>  #include "xe_sriov_pf.h"
>> @@ -102,6 +103,32 @@ int xe_sriov_pf_init_early(struct xe_device *xe)
>>  	return 0;
>>  }
>>  
>> +/**
>> + * xe_sriov_pf_wait_ready() - Wait until PF is ready to operate.
>> + * @xe: the &xe_device to test
>> + *
>> + * This function can only be called on PF.
>> + *
>> + * Return: 0 on success or a negative error code on failure.
>> + */
>> +int xe_sriov_pf_wait_ready(struct xe_device *xe)
>> +{
>> +	struct xe_gt *gt;
>> +	unsigned int id;
>> +	int err = 0;
>> +
>> +	if (xe_device_wedged(xe))
>> +		return -ECANCELED;
>> +
>> +	for_each_gt(gt, xe, id) {
>> +		err = xe_gt_sriov_pf_wait_ready(gt);
>> +		if (err)
>> +			break;
>> +	}
>> +
>> +	return err;
>> +}
>> +
>>  /**
>>   * xe_sriov_pf_print_vfs_summary - Print SR-IOV PF information.
>>   * @xe: the &xe_device to print info from
>> diff --git a/drivers/gpu/drm/xe/xe_sriov_pf.h b/drivers/gpu/drm/xe/xe_sriov_pf.h
>> index c392c3fcf085..e3b34f8f5e04 100644
>> --- a/drivers/gpu/drm/xe/xe_sriov_pf.h
>> +++ b/drivers/gpu/drm/xe/xe_sriov_pf.h
>> @@ -15,6 +15,7 @@ struct xe_device;
>>  #ifdef CONFIG_PCI_IOV
>>  bool xe_sriov_pf_readiness(struct xe_device *xe);
>>  int xe_sriov_pf_init_early(struct xe_device *xe);
>> +int xe_sriov_pf_wait_ready(struct xe_device *xe);
>>  void xe_sriov_pf_debugfs_register(struct xe_device *xe, struct dentry *root);
>>  void xe_sriov_pf_print_vfs_summary(struct xe_device *xe, struct drm_printer *p);
>>  #else
>> -- 
>> 2.47.1
>>
>>



More information about the Intel-xe mailing list