[PATCH 2/2] RFC drm/xe: re-order lmem init check and wait for initialization to complete

Mon Mar 11 14:40:10 UTC 2024

On Fri, Mar 08, 2024 at 09:42:30AM -0500, Rodrigo Vivi wrote:
>On Fri, Mar 08, 2024 at 02:25:17PM +0530, Riana Tauro wrote:
>> Lmem init check should be done only after pcode initialization
>> status is complete. Move lmem init check after pcode status
>> check. Also wait for a short while after pcode status check
>> to allow completion of the task.
>>
>> Failing to do so, can lead to aborting the module load
>> leaving the system unusable. Wait until the lmem initialization
>> is complete within a timeout (60s) or till the user aborts.
>>
>> Signed-off-by: Riana Tauro <riana.tauro at intel.com>
>> ---
>>  drivers/gpu/drm/xe/xe_device.c | 53 +++++++++++++++++++++++++++++++++-
>>  drivers/gpu/drm/xe/xe_mmio.c   | 29 -------------------
>>  2 files changed, 52 insertions(+), 30 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
>> index 83dd60f68566..4806e7806be5 100644
>> --- a/drivers/gpu/drm/xe/xe_device.c
>> +++ b/drivers/gpu/drm/xe/xe_device.c
>> @@ -413,12 +413,59 @@ static int xe_set_dma_info(struct xe_device *xe)
>>  	return err;
>>  }
>>
>> +static int verify_lmem_ready(struct xe_gt *gt)
>
>maybe bool?
>
>> +{
>> +	return xe_mmio_read32(gt, GU_CNTL) & LMEM_INIT;
>> +}
>> +
>> +static int wait_for_lmem_ready(struct xe_device *xe)
>> +{
>> +	struct xe_gt *gt = xe_root_mmio_gt(xe);
>> +	unsigned long timeout, start;
>> +
>> +	if (!IS_DGFX(xe))
>> +		return 0;
>> +
>> +	if (IS_SRIOV_VF(xe))
>> +		return 0;
>> +	/*
>> +	 * The boot firmware initializes local memory and assesses its health.
>> +	 * If memory training fails, the punit will have been instructed to
>> +	 * keep the GT powered down; we won't be able to communicate with it
>> +	 * and we should not continue with driver initialization.
>> +	 */
>
>the comment is negative in a positive outcome.
>I mean, one reading this comment above might conclude that we are returning below
>because we won't be able to communicate with the GT.
>
>but the code is right and we need this change. thanks for taking care of it.

I'm not sure it's indeed correct. See below.

>
>Acked-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
>
>> +	if (verify_lmem_ready(gt))
>> +		return 0;
>> +
>> +	drm_dbg(&xe->drm, "Waiting for lmem initialisation\n");
>> +
>> +	start = jiffies;
>> +	timeout = start + msecs_to_jiffies(60 * 1000); /* 60 sec! */
>> +
>> +	do {
>> +		if (signal_pending(current))
>> +			return -EINTR;
>> +
>> +		if (time_after(jiffies, timeout))
>> +			return -EPROBE_DEFER;

-EPROBE_DEFER will instruct the driver core to re-attempt binding to the
device later.

However, we are already doing the wait here, so why are we returning
-EPROBE_DEFER to re-attempt it later? We need one or the other, not
both.

It would be good to have fault injection here so that we are actually
testing the error path, since I don't think we are triggering this path
on any machine in CI.

Lucas De Marchi

>> +
>> +		msleep(20);
>> +
>> +	} while (!verify_lmem_ready(gt));
>> +
>> +	drm_dbg(&xe->drm, "lmem ready after %ums",
>> +		jiffies_to_msecs(jiffies - start));
>> +
>> +	return 0;
>> +}
>> +
>>  /**
>>   * xe_device_probe_early: Device early probe
>>   * @xe: xe device instance
>>   *
>>   * Initialize MMIO resources that don't require any
>> - * knowledge about tile count. Also initialize pcode
>> + * knowledge about tile count. Also initialize pcode and
>> + * check vram initialization on root tile.
>>   *
>>   * Return: 0 on success, error code on failure
>>   */
>> @@ -438,6 +485,10 @@ int xe_device_probe_early(struct xe_device *xe)
>>  	if (err)
>>  		return err;
>>
>> +	err = wait_for_lmem_ready(xe);
>> +	if (err)
>> +		return err;
>> +
>>  	return 0;
>>  }
>>
>> diff --git a/drivers/gpu/drm/xe/xe_mmio.c b/drivers/gpu/drm/xe/xe_mmio.c
>> index 7ba2477452d7..7fc0c5453b21 100644
>> --- a/drivers/gpu/drm/xe/xe_mmio.c
>> +++ b/drivers/gpu/drm/xe/xe_mmio.c
>> @@ -360,30 +360,6 @@ static void mmio_fini(struct drm_device *drm, void *arg)
>>  		iounmap(xe->mem.vram.mapping);
>>  }
>>
>> -static int xe_verify_lmem_ready(struct xe_device *xe)
>> -{
>> -	struct xe_gt *gt = xe_root_mmio_gt(xe);
>> -
>> -	if (!IS_DGFX(xe))
>> -		return 0;
>> -
>> -	if (IS_SRIOV_VF(xe))
>> -		return 0;
>> -
>> -	/*
>> -	 * The boot firmware initializes local memory and assesses its health.
>> -	 * If memory training fails, the punit will have been instructed to
>> -	 * keep the GT powered down; we won't be able to communicate with it
>> -	 * and we should not continue with driver initialization.
>> -	 */
>> -	if (!(xe_mmio_read32(gt, GU_CNTL) & LMEM_INIT)) {
>> -		drm_err(&xe->drm, "VRAM not initialized by firmware\n");
>> -		return -ENODEV;
>> -	}
>> -
>> -	return 0;
>> -}
>> -
>>  int xe_mmio_init(struct xe_device *xe)
>>  {
>>  	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
>> @@ -407,16 +383,11 @@ int xe_mmio_init(struct xe_device *xe)
>>  int xe_mmio_root_tile_init(struct xe_device *xe)
>>  {
>>  	struct xe_tile *root_tile = xe_device_get_root_tile(xe);
>> -	int err;
>>
>>  	/* Setup first tile; other tiles (if present) will be setup later. */
>>  	root_tile->mmio.size = SZ_16M;
>>  	root_tile->mmio.regs = xe->mmio.regs;
>>
>> -	err = xe_verify_lmem_ready(xe);
>> -	if (err)
>> -		return err;
>> -
>>  	return 0;
>>  }
>>
>> --
>> 2.40.0
>>