[PATCH] drm/amdkfd: add ACPI SRAT parsing for topology

Eric Huang jinhuieric.huang at amd.com
Mon May 3 19:27:22 UTC 2021



On 2021-05-03 3:13 p.m., Felix Kuehling wrote:
> Am 2021-05-03 um 10:47 a.m. schrieb Eric Huang:
>> In NPS4 BIOS we need to find the closest numa node when creating
>> topology io link between cpu and gpu, if PCI driver doesn't set
>> it.
>>
>> Signed-off-by: Eric Huang <jinhuieric.huang at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 95 ++++++++++++++++++++++++++-
>>   1 file changed, 93 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> index 38d45711675f..58c6738de774 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> @@ -1759,6 +1759,91 @@ static int kfd_fill_gpu_memory_affinity(int *avail_size,
>>   	return 0;
>>   }
>>   
>> +#ifdef CONFIG_ACPI_NUMA
>> +static void kfd_find_numa_node_in_srat(struct kfd_dev *kdev)
>> +{
>> +	struct acpi_table_header *table_header = NULL;
>> +	struct acpi_subtable_header *sub_header = NULL;
>> +	unsigned long table_end, subtable_len;
>> +	u32 pci_id = pci_domain_nr(kdev->pdev->bus) << 16 |
>> +			pci_dev_id(kdev->pdev);
>> +	u32 bdf;
>> +	acpi_status status;
>> +	struct acpi_srat_cpu_affinity *cpu;
>> +	struct acpi_srat_generic_affinity *gpu;
>> +	int pxm = 0, max_pxm = 0;
>> +	int numa_node = NUMA_NO_NODE;
>> +	bool found = false;
>> +
>> +	/* Fetch the SRAT table from ACPI */
>> +	status = acpi_get_table(ACPI_SIG_SRAT, 0, &table_header);
>> +	if (status == AE_NOT_FOUND) {
>> +		pr_warn("SRAT table not found\n");
>> +		return;
>> +	} else if (ACPI_FAILURE(status)) {
>> +		const char *err = acpi_format_exception(status);
>> +		pr_err("SRAT table error: %s\n", err);
>> +		return;
>> +	}
>> +
>> +	table_end = (unsigned long)table_header + table_header->length;
>> +
>> +	/* Parse all entries looking for a match. */
>> +	sub_header = (struct acpi_subtable_header *)
>> +			((unsigned long)table_header +
>> +			sizeof(struct acpi_table_srat));
>> +	subtable_len = sub_header->length;
>> +
>> +	while (((unsigned long)sub_header) + subtable_len  < table_end) {
>> +		/*
>> +		 * If length is 0, break from this loop to avoid
>> +		 * infinite loop.
>> +		 */
>> +		if (subtable_len == 0) {
>> +			pr_err("SRAT invalid zero length\n");
>> +			break;
>> +		}
>> +
>> +		switch (sub_header->type) {
>> +		case ACPI_SRAT_TYPE_CPU_AFFINITY:
>> +			cpu = (struct acpi_srat_cpu_affinity *)sub_header;
>> +			pxm = *((u32 *)cpu->proximity_domain_hi) << 8 |
>> +					cpu->proximity_domain_lo;
>> +			if (pxm > max_pxm)
>> +				max_pxm = pxm;
>> +			break;
>> +		case ACPI_SRAT_TYPE_GENERIC_AFFINITY:
>> +			gpu = (struct acpi_srat_generic_affinity *)sub_header;
>> +			bdf = *((u16 *)(&gpu->device_handle[0])) << 16 |
>> +					*((u16 *)(&gpu->device_handle[2]));
>> +			if (bdf == pci_id) {
>> +				found = true;
>> +				numa_node = pxm_to_node(gpu->proximity_domain);
>> +			}
>> +			break;
>> +		default:
>> +			break;
>> +		}
>> +
>> +		if (found)
>> +			break;
>> +
>> +		sub_header = (struct acpi_subtable_header *)
>> +				((unsigned long)sub_header + subtable_len);
>> +		subtable_len = sub_header->length;
>> +	}
>> +
>> +	acpi_put_table(table_header);
>> +
>> +	/* Workaround bad cpu-gpu binding case */
>> +	if (found && (numa_node < 0 || numa_node > max_pxm))
>> +		numa_node = 0;
>> +
>> +	if (numa_node != NUMA_NO_NODE)
>> +		set_dev_node(&kdev->pdev->dev, numa_node);
>> +}
>> +#endif
>> +
>>   /* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU
>>    * to its NUMA node
>>    *	@avail_size: Available size in the memory
>> @@ -1804,10 +1889,16 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
>>   	}
>>   
>>   	sub_type_hdr->proximity_domain_from = proximity_domain;
>> -#ifdef CONFIG_NUMA
>> +
>> +#ifdef CONFIG_ACPI_NUMA
>>   	if (kdev->pdev->dev.numa_node == NUMA_NO_NODE)
>> +		kfd_find_numa_node_in_srat(kdev);
>> +#endif
>> +#ifdef CONFIG_NUMA
>> +	if (kdev->pdev->dev.numa_node == NUMA_NO_NODE) {
>>   		sub_type_hdr->proximity_domain_to = 0;
>> -	else
>> +		set_dev_node(&kdev->pdev->dev, 0);
> This should not be here. If you really want to lie about the NUMA node
> and pretend that it's 0 and not NO_NODE, then that should be done in
> kfd_find_numa_node_in_srat. That should be the only function that
> changes the dev->numa_node. Like Oak pointed out, eventually that should
> maybe not even be part of the driver. But I'm OK with keeping it as a
> fallback in the driver for the case that one a GPU doesn't have a NUMA
> node assigned by the kernel.
>
> But is setting the dev->numa_node to 0 really necessary? Does anything
> else in KFD depend on the dev->numa_node being 0? This function is only
> supposed to fill the proximity_domain in the CRAT table. Setting
> dev->numa_node is a side effect. If we can't figure out the correct NUMA
> node, we shouldn't just guess 0 in a way that potentially affects other
> parts of the kernel.
>
> Regards,
>    Felix
>
The reason I am adding it is for 
http://ontrack-internal.amd.com/browse/SWDEV-281376.

RCCL is using /sys/class/drm/card0/device/numa_node to determine numa 
node which GPU is close to. To keep consistence between KFD topology and 
pci sysfs exposure, I add it as a workaround in NPS1 and ACPI is not 
configured.

Thanks,
Eric

>> +	} else
>>   		sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node;
>>   #else
>>   	sub_type_hdr->proximity_domain_to = 0;



More information about the amd-gfx mailing list