[PATCH 2/9] drm/amdgpu: Add helper funcs for jpeg devcoredump

Tue Jan 28 14:52:30 UTC 2025

On 1/28/2025 7:55 PM, Sundararaju, Sathishkumar wrote:
>
>
>
> On 1/28/2025 5:29 PM, Lazar, Lijo wrote:
>>
>> On 1/28/2025 5:06 PM, Sundararaju, Sathishkumar wrote:
>>> Hi Lijo,
>>>
>>>
>>> On 1/28/2025 3:04 PM, Lazar, Lijo wrote:
>>>> On 1/28/2025 2:39 PM, Sathishkumar S wrote:
>>>>> Add devcoredump helper functions that can be reused for all jpeg
>>>>> versions.
>>>>>
>>>>> Signed-off-by: Sathishkumar S <sathishkumar.sundararaju at amd.com>
>>>>> ---
>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 59 
>>>>> ++++++++++++++++++++++++
>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h |  7 +++
>>>>>    2 files changed, 66 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c b/drivers/gpu/
>>>>> drm/amd/amdgpu/amdgpu_jpeg.c
>>>>> index b6d2eb049f54..70f1e0e88f4b 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
>>>>> @@ -452,3 +452,62 @@ void amdgpu_jpeg_sysfs_reset_mask_fini(struct
>>>>> amdgpu_device *adev)
>>>>>                device_remove_file(adev->dev, 
>>>>> &dev_attr_jpeg_reset_mask);
>>>>>        }
>>>>>    }
>>>>> +
>>>>> +void amdgpu_jpeg_dump_ip_state(struct amdgpu_ip_block *ip_block,
>>>>> +                   const struct amdgpu_hwip_reg_entry *reg, u32
>>>>> reg_count)
>>>>> +{
>>>>> +    struct amdgpu_device *adev = ip_block->adev;
>>>>> +    u32 inst_off, inst_id, is_powered;
>>>>> +    int i, j;
>>>>> +
>>>>> +    if (!adev->jpeg.ip_dump)
>>>>> +        return;
>>>>> +
>>>>> +    for (i = 0; i < adev->jpeg.num_jpeg_inst; i++) {
>>>>> +        if (adev->jpeg.harvest_config & (1 << i))
>>>>> +            continue;
>>>>> +
>>>>> +        inst_id = GET_INST(JPEG, i);
>>>>> +        inst_off = i * reg_count;
>>>>> +        /* check power status from UVD_JPEG_POWER_STATUS */
>>>>> +        adev->jpeg.ip_dump[inst_off] =
>>>>> RREG32(SOC15_REG_ENTRY_OFFSET_INST(reg[0],
>>>>> +                                          inst_id));
>>>>> +        is_powered = ((adev->jpeg.ip_dump[inst_off] & 0x1) != 1);
>>>>> +
>>>>> +        if (is_powered)
>>>>> +            for (j = 1; j < reg_count; j++)
>>>>> +                adev->jpeg.ip_dump[inst_off + j] =
>>>>> + RREG32(SOC15_REG_ENTRY_OFFSET_INST(reg[j],
>>>>> +                                       inst_id));
>>>>> +    }
>>>>> +}
>>>>> +
>>>>> +void amdgpu_jpeg_print_ip_state(struct amdgpu_ip_block *ip_block,
>>>>> struct drm_printer *p,
>>>>> +                const struct amdgpu_hwip_reg_entry *reg, u32 
>>>>> reg_count)
>>>>> +{
>>>>> +    struct amdgpu_device *adev = ip_block->adev;
>>>>> +    u32 inst_off, is_powered;
>>>>> +    int i, j;
>>>>> +
>>>>> +    if (!adev->jpeg.ip_dump)
>>>>> +        return;
>>>>> +
>>>>> +    drm_printf(p, "num_instances:%d\n", adev->jpeg.num_jpeg_inst);
>>>>> +    for (i = 0; i < adev->jpeg.num_jpeg_inst; i++) {
>>>>> +        if (adev->jpeg.harvest_config & (1 << i)) {
>>>>> +            drm_printf(p, "\nHarvested Instance:JPEG%d Skipping
>>>>> dump\n", i);
>>>>> +            continue;
>>>>> +        }
>>>>> +
>>>>> +        inst_off = i * reg_count;
>>>>> +        is_powered = ((adev->jpeg.ip_dump[inst_off] & 0x1) != 1);
>>>>> +
>>>>> +        if (is_powered) {
>>>>> +            drm_printf(p, "Active Instance:JPEG%d\n", i);
>>>>> +            for (j = 0; j < reg_count; j++)
>>>>> +                drm_printf(p, "%-50s \t 0x%08x\n", reg[j].reg_name,
>>>>> +                       adev->jpeg.ip_dump[inst_off + j]);
>>>>> +        } else
>>>>> +            drm_printf(p, "\nInactive Instance:JPEG%d\n", i);
>>>>> +    }
>>>>> +}
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h b/drivers/gpu/
>>>>> drm/amd/amdgpu/amdgpu_jpeg.h
>>>>> index eb2096dcf1a6..1d334f35d8a8 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
>>>>> @@ -92,6 +92,8 @@
>>>>> *adev->jpeg.inst[inst_idx].dpg_sram_curr_addr++ = value;    \
>>>>>        } while (0)
>>>>>    +struct amdgpu_hwip_reg_entry;
>>>>> +
>>>>>    enum amdgpu_jpeg_caps {
>>>>>        AMDGPU_JPEG_RRMT_ENABLED,
>>>>>    };
>>>>> @@ -137,6 +139,7 @@ struct amdgpu_jpeg {
>>>>>        bool    indirect_sram;
>>>>>        uint32_t supported_reset;
>>>>>        uint32_t caps;
>>>>> +    u32 *ip_dump;
>>>> It's better to keep this at per jpeg instance level 
>>>> (amdgpu_jpeg_inst).
>>>> If the hang happens for a single jpeg ring, that will help rather than
>>>> dumping out reg settings for all instances.
>>> The devcoredump infra in amdgpu does not propagate the job itself or 
>>> the
>>> hung instance, so there is no feasible way of doing this, meaning
>>> dumping only the affected instance.
>>> And if every instance is represented by an ip_block then I can 
>>> implement
>>> this with a flag to handle per instance case, but I doubt it is, as 
>>> seen
>>> from the below commit
>>> fba4761ca00f drm/amdgpu: partially revert VCN IP block instancing 
>>> support
>>>
>> Even if the API in its current form doesn't support per instance dump,
>> suggest to keept the reg_dump data struct in jpeg_instance. That way it
>> doesn't need any calculation to find the right offset etc.
>>
>> On the other hand, a single copy may be maintained for the reg address
>> list if that is feasible. As I see the address calculation is dynamic
>> based on the instance id and it may not make sense to keep it in all
>> instances.
> I am aligned with you on maintaining a single copy of the reg_list, 
> and so, yes address calculation is dynamic
> based on the the instance id as you pointed out, in that case 
> maintaining this inside an instance tracking structure
> doesn't feel right, as multiple instances track the same reg_list 
> pointer which is the same list for all instances given an IP version.
>
> The request I am proposing is to maintain the reg_list inside 
> adev->jpeg, a common list of registers for any instance in the ip, and
> all the ip_blocks of type JPEG refer to this common list when separate 
> instances are represented by discrete ip_blocks, and dump per instance
> register list in the future when that is planned and enabled.
>
> Regards,
> Sathish

If I think through again, it seems to me as well that the reg_list is 
meant to be stored here considering it's relevance to ip_block.
I will rework the patch with your suggestion. Thank you.

Regards,
Sathish
>>
>> Thanks,
>> Lijo
>>
>>> Regards,
>>> Sathish
>>>> Thanks,
>>>> Lijo
>>>>
>>>>>    };
>>>>>      int amdgpu_jpeg_sw_init(struct amdgpu_device *adev);
>>>>> @@ -161,5 +164,9 @@ int amdgpu_jpeg_psp_update_sram(struct
>>>>> amdgpu_device *adev, int inst_idx,
>>>>>    void amdgpu_debugfs_jpeg_sched_mask_init(struct amdgpu_device 
>>>>> *adev);
>>>>>    int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device *adev);
>>>>>    void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device 
>>>>> *adev);
>>>>> +void amdgpu_jpeg_dump_ip_state(struct amdgpu_ip_block *ip_block,
>>>>> +                   const struct amdgpu_hwip_reg_entry *reg, u32
>>>>> reg_count);
>>>>> +void amdgpu_jpeg_print_ip_state(struct amdgpu_ip_block *ip_block,
>>>>> struct drm_printer *p,
>>>>> +                const struct amdgpu_hwip_reg_entry *reg, u32
>>>>> reg_count);
>>>>>      #endif /*__AMDGPU_JPEG_H__*/
>