[Intel-gfx] [PATCH 4/6] drm/i915/uc: Enhancements to firmware table validation
John Harrison
john.c.harrison at intel.com
Sat Apr 29 00:30:47 UTC 2023
On 4/28/2023 17:26, Ceraolo Spurio, Daniele wrote:
> On 4/28/2023 5:16 PM, John Harrison wrote:
>> On 4/28/2023 17:04, Ceraolo Spurio, Daniele wrote:
>>> On 4/20/2023 6:15 PM, John.C.Harrison at Intel.com wrote:
>>>> From: John Harrison <John.C.Harrison at Intel.com>
>>>>
>>>> The validation of the firmware table was being done inside the code
>>>> for scanning the table for the next available firmware blob. Which is
>>>> unnecessary. So pull it out into a separate function that is only
>>>> called once per blob type at init time.
>>>>
>>>> Also, drop the CONFIG_SELFTEST requirement and make errors terminal.
>>>> It was mentioned that potential issues with backports would not be
>>>> caught by regular pre-merge CI as that only occurs on tip not stable
>>>> branches. Making the validation unconditional and failing driver load
>>>> on detecting of a problem ensures that such backports will also be
>>>> validated correctly.
>>>>
>>>> v2: Change to unconditionally fail module load on a validation error
>>>> (review feedback/discussion with Daniele).
>>>>
>>>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>>>> ---
>>>> drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 157
>>>> +++++++++++++----------
>>>> 1 file changed, 92 insertions(+), 65 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
>>>> b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
>>>> index c9cd9bb47577f..eb52e8db9ae0b 100644
>>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
>>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
>>>> @@ -233,20 +233,22 @@ struct fw_blobs_by_type {
>>>> u32 count;
>>>> };
>>>> +static const struct uc_fw_platform_requirement blobs_guc[] = {
>>>> + INTEL_GUC_FIRMWARE_DEFS(MAKE_FW_LIST, GUC_FW_BLOB,
>>>> GUC_FW_BLOB_MMP)
>>>> +};
>>>> +
>>>> +static const struct uc_fw_platform_requirement blobs_huc[] = {
>>>> + INTEL_HUC_FIRMWARE_DEFS(MAKE_FW_LIST, HUC_FW_BLOB,
>>>> HUC_FW_BLOB_MMP, HUC_FW_BLOB_GSC)
>>>> +};
>>>> +
>>>> +static const struct fw_blobs_by_type
>>>> blobs_all[INTEL_UC_FW_NUM_TYPES] = {
>>>> + [INTEL_UC_FW_TYPE_GUC] = { blobs_guc, ARRAY_SIZE(blobs_guc) },
>>>> + [INTEL_UC_FW_TYPE_HUC] = { blobs_huc, ARRAY_SIZE(blobs_huc) },
>>>> +};
>>>> +
>>>> static void
>>>> __uc_fw_auto_select(struct drm_i915_private *i915, struct
>>>> intel_uc_fw *uc_fw)
>>>> {
>>>> - static const struct uc_fw_platform_requirement blobs_guc[] = {
>>>> - INTEL_GUC_FIRMWARE_DEFS(MAKE_FW_LIST, GUC_FW_BLOB,
>>>> GUC_FW_BLOB_MMP)
>>>> - };
>>>> - static const struct uc_fw_platform_requirement blobs_huc[] = {
>>>> - INTEL_HUC_FIRMWARE_DEFS(MAKE_FW_LIST, HUC_FW_BLOB,
>>>> HUC_FW_BLOB_MMP, HUC_FW_BLOB_GSC)
>>>> - };
>>>> - static const struct fw_blobs_by_type
>>>> blobs_all[INTEL_UC_FW_NUM_TYPES] = {
>>>> - [INTEL_UC_FW_TYPE_GUC] = { blobs_guc,
>>>> ARRAY_SIZE(blobs_guc) },
>>>> - [INTEL_UC_FW_TYPE_HUC] = { blobs_huc,
>>>> ARRAY_SIZE(blobs_huc) },
>>>> - };
>>>> - static bool verified[INTEL_UC_FW_NUM_TYPES];
>>>> const struct uc_fw_platform_requirement *fw_blobs;
>>>> enum intel_platform p = INTEL_INFO(i915)->platform;
>>>> u32 fw_count;
>>>> @@ -286,6 +288,11 @@ __uc_fw_auto_select(struct drm_i915_private
>>>> *i915, struct intel_uc_fw *uc_fw)
>>>> continue;
>>>> if (uc_fw->file_selected.path) {
>>>> + /*
>>>> + * Continuing an earlier search after a found blob
>>>> failed to load.
>>>> + * Once the previously chosen path has been found,
>>>> clear it out
>>>> + * and let the search continue from there.
>>>> + */
>>>> if (uc_fw->file_selected.path == blob->path)
>>>> uc_fw->file_selected.path = NULL;
>>>> @@ -306,76 +313,91 @@ __uc_fw_auto_select(struct drm_i915_private
>>>> *i915, struct intel_uc_fw *uc_fw)
>>>> /* Failed to find a match for the last attempt?! */
>>>> uc_fw->file_selected.path = NULL;
>>>> }
>>>> +}
>>>> - /* make sure the list is ordered as expected */
>>>> - if (IS_ENABLED(CONFIG_DRM_I915_SELFTEST) &&
>>>> !verified[uc_fw->type]) {
>>>> - verified[uc_fw->type] = true;
>>>> +static bool validate_fw_table_type(struct drm_i915_private *i915,
>>>> enum intel_uc_fw_type type)
>>>> +{
>>>> + const struct uc_fw_platform_requirement *fw_blobs;
>>>> + u32 fw_count;
>>>> + int i;
>>>> - for (i = 1; i < fw_count; i++) {
>>>> - /* Next platform is good: */
>>>> - if (fw_blobs[i].p < fw_blobs[i - 1].p)
>>>> - continue;
>>>> + if (type >= ARRAY_SIZE(blobs_all)) {
>>>> + drm_err(&i915->drm, "No blob array for %s\n",
>>>> intel_uc_fw_type_repr(type));
>>>> + return false;
>>>> + }
>>>> - /* Next platform revision is good: */
>>>> - if (fw_blobs[i].p == fw_blobs[i - 1].p &&
>>>> - fw_blobs[i].rev < fw_blobs[i - 1].rev)
>>>> - continue;
>>>> + fw_blobs = blobs_all[type].blobs;
>>>> + fw_count = blobs_all[type].count;
>>>> - /* Platform/revision must be in order: */
>>>> - if (fw_blobs[i].p != fw_blobs[i - 1].p ||
>>>> - fw_blobs[i].rev != fw_blobs[i - 1].rev)
>>>> - goto bad;
>>>> + if (!fw_count)
>>>> + return true;
>>>> - /* Next major version is good: */
>>>> - if (fw_blobs[i].blob.major < fw_blobs[i - 1].blob.major)
>>>> - continue;
>>>> + /* make sure the list is ordered as expected */
>>>> + for (i = 1; i < fw_count; i++) {
>>>> + /* Next platform is good: */
>>>> + if (fw_blobs[i].p < fw_blobs[i - 1].p)
>>>> + continue;
>>>> - /* New must be before legacy: */
>>>> - if (!fw_blobs[i].blob.legacy && fw_blobs[i -
>>>> 1].blob.legacy)
>>>> - goto bad;
>>>> + /* Next platform revision is good: */
>>>> + if (fw_blobs[i].p == fw_blobs[i - 1].p &&
>>>> + fw_blobs[i].rev < fw_blobs[i - 1].rev)
>>>> + continue;
>>>> - /* New to legacy also means 0.0 to X.Y (HuC), or X.0
>>>> to X.Y (GuC) */
>>>> - if (fw_blobs[i].blob.legacy && !fw_blobs[i -
>>>> 1].blob.legacy) {
>>>> - if (!fw_blobs[i - 1].blob.major)
>>>> - continue;
>>>> + /* Platform/revision must be in order: */
>>>> + if (fw_blobs[i].p != fw_blobs[i - 1].p ||
>>>> + fw_blobs[i].rev != fw_blobs[i - 1].rev)
>>>> + goto bad;
>>>> - if (fw_blobs[i].blob.major == fw_blobs[i -
>>>> 1].blob.major)
>>>> - continue;
>>>> - }
>>>> + /* Next major version is good: */
>>>> + if (fw_blobs[i].blob.major < fw_blobs[i - 1].blob.major)
>>>> + continue;
>>>> - /* Major versions must be in order: */
>>>> - if (fw_blobs[i].blob.major != fw_blobs[i - 1].blob.major)
>>>> - goto bad;
>>>> + /* New must be before legacy: */
>>>> + if (!fw_blobs[i].blob.legacy && fw_blobs[i - 1].blob.legacy)
>>>> + goto bad;
>>>> - /* Next minor version is good: */
>>>> - if (fw_blobs[i].blob.minor < fw_blobs[i - 1].blob.minor)
>>>> + /* New to legacy also means 0.0 to X.Y (HuC), or X.0 to
>>>> X.Y (GuC) */
>>>> + if (fw_blobs[i].blob.legacy && !fw_blobs[i -
>>>> 1].blob.legacy) {
>>>> + if (!fw_blobs[i - 1].blob.major)
>>>> continue;
>>>> - /* Minor versions must be in order: */
>>>> - if (fw_blobs[i].blob.minor != fw_blobs[i - 1].blob.minor)
>>>> - goto bad;
>>>> -
>>>> - /* Patch versions must be in order: */
>>>> - if (fw_blobs[i].blob.patch <= fw_blobs[i - 1].blob.patch)
>>>> + if (fw_blobs[i].blob.major == fw_blobs[i - 1].blob.major)
>>>> continue;
>>>> + }
>>>> +
>>>> + /* Major versions must be in order: */
>>>> + if (fw_blobs[i].blob.major != fw_blobs[i - 1].blob.major)
>>>> + goto bad;
>>>> +
>>>> + /* Next minor version is good: */
>>>> + if (fw_blobs[i].blob.minor < fw_blobs[i - 1].blob.minor)
>>>> + continue;
>>>> +
>>>> + /* Minor versions must be in order: */
>>>> + if (fw_blobs[i].blob.minor != fw_blobs[i - 1].blob.minor)
>>>> + goto bad;
>>>> +
>>>> + /* Patch versions must be in order: */
>>>> + if (fw_blobs[i].blob.patch <= fw_blobs[i - 1].blob.patch)
>>>> + continue;
>>>> bad:
>>>> - drm_err(&i915->drm, "Invalid %s blob order: %s r%u
>>>> %s%d.%d.%d comes before %s r%u %s%d.%d.%d\n",
>>>> - intel_uc_fw_type_repr(uc_fw->type),
>>>> - intel_platform_name(fw_blobs[i - 1].p), fw_blobs[i
>>>> - 1].rev,
>>>> - fw_blobs[i - 1].blob.legacy ? "L" : "v",
>>>> - fw_blobs[i - 1].blob.major,
>>>> - fw_blobs[i - 1].blob.minor,
>>>> - fw_blobs[i - 1].blob.patch,
>>>> - intel_platform_name(fw_blobs[i].p), fw_blobs[i].rev,
>>>> - fw_blobs[i].blob.legacy ? "L" : "v",
>>>> - fw_blobs[i].blob.major,
>>>> - fw_blobs[i].blob.minor,
>>>> - fw_blobs[i].blob.patch);
>>>> -
>>>> - uc_fw->file_selected.path = NULL;
>>>> - }
>>>> + drm_err(&i915->drm, "Invalid %s blob order: %s r%u
>>>> %s%d.%d.%d comes before %s r%u %s%d.%d.%d\n",
>>>> + intel_uc_fw_type_repr(type),
>>>> + intel_platform_name(fw_blobs[i - 1].p), fw_blobs[i -
>>>> 1].rev,
>>>> + fw_blobs[i - 1].blob.legacy ? "L" : "v",
>>>> + fw_blobs[i - 1].blob.major,
>>>> + fw_blobs[i - 1].blob.minor,
>>>> + fw_blobs[i - 1].blob.patch,
>>>> + intel_platform_name(fw_blobs[i].p), fw_blobs[i].rev,
>>>> + fw_blobs[i].blob.legacy ? "L" : "v",
>>>> + fw_blobs[i].blob.major,
>>>> + fw_blobs[i].blob.minor,
>>>> + fw_blobs[i].blob.patch);
>>>> + return false;
>>>> }
>>>> +
>>>> + return true;
>>>> }
>>>> static const char *__override_guc_firmware_path(struct
>>>> drm_i915_private *i915)
>>>> @@ -443,6 +465,11 @@ void intel_uc_fw_init_early(struct intel_uc_fw
>>>> *uc_fw,
>>>> uc_fw->type = type;
>>>> if (HAS_GT_UC(i915)) {
>>>> + if (!validate_fw_table_type(i915, type)) {
>>>> + intel_uc_fw_change_status(uc_fw,
>>>> INTEL_UC_FIRMWARE_ERROR);
>>>
>>> In our hierarchy of firmware statuses, INTEL_UC_FIRMWARE_ERROR
>>> includes the fact that the fw has been selected, which in turns
>>> implies that file_selected.path is valid. this means that even with
>>> enable_guc=0 the wants/uses_guc macro might end up returning true,
>>> which is not something we want.
>>>
>>> Daniele
>> Suggestions for a better plan? Add an another status enum? Nothing
>> earlier in the sequence seems appropriate. And the init_early stack
>> does not support returning error codes.
>
> I think the question here is: what are you expecting to happen in case
> of error and on what platforms? let's say we have an invalid table
> entry for ADLP, would the expectation be that all GuC platforms won't
> boot, or just ADLP? And is that only if we have enable_guc set to a
> positive value, or even if enable_guc=0?
The intention is to totally break driver load on any table error.
The reason being that someone is back porting a firmware update to ADL-P
but breaks DG2 in the process. However, the are only intending to change
ADL-P and so don't test on DG2. They therefore don't realise that the
driver is now broken for someone else. Whereas, if we make any table
error a fatal load failure irrespective of tested platform, enable_guc
or other module params, etc. then it is guaranteed to be caught no
matter what platform they test on.
John.
>
> Daniele
>
>>
>> John.
>>
>>
>>>
>>>> + return;
>>>> + }
>>>> +
>>>> __uc_fw_auto_select(i915, uc_fw);
>>>> __uc_fw_user_override(i915, uc_fw);
>>>> }
>>>
>>
>
More information about the Intel-gfx
mailing list