[PATCH] drm/amdkfd: add ACPI SRAT parsing for topology
Felix Kuehling
felix.kuehling at amd.com
Mon May 3 19:13:34 UTC 2021
Am 2021-05-03 um 10:47 a.m. schrieb Eric Huang:
> In NPS4 BIOS we need to find the closest numa node when creating
> topology io link between cpu and gpu, if PCI driver doesn't set
> it.
>
> Signed-off-by: Eric Huang <jinhuieric.huang at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 95 ++++++++++++++++++++++++++-
> 1 file changed, 93 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> index 38d45711675f..58c6738de774 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> @@ -1759,6 +1759,91 @@ static int kfd_fill_gpu_memory_affinity(int *avail_size,
> return 0;
> }
>
> +#ifdef CONFIG_ACPI_NUMA
> +static void kfd_find_numa_node_in_srat(struct kfd_dev *kdev)
> +{
> + struct acpi_table_header *table_header = NULL;
> + struct acpi_subtable_header *sub_header = NULL;
> + unsigned long table_end, subtable_len;
> + u32 pci_id = pci_domain_nr(kdev->pdev->bus) << 16 |
> + pci_dev_id(kdev->pdev);
> + u32 bdf;
> + acpi_status status;
> + struct acpi_srat_cpu_affinity *cpu;
> + struct acpi_srat_generic_affinity *gpu;
> + int pxm = 0, max_pxm = 0;
> + int numa_node = NUMA_NO_NODE;
> + bool found = false;
> +
> + /* Fetch the SRAT table from ACPI */
> + status = acpi_get_table(ACPI_SIG_SRAT, 0, &table_header);
> + if (status == AE_NOT_FOUND) {
> + pr_warn("SRAT table not found\n");
> + return;
> + } else if (ACPI_FAILURE(status)) {
> + const char *err = acpi_format_exception(status);
> + pr_err("SRAT table error: %s\n", err);
> + return;
> + }
> +
> + table_end = (unsigned long)table_header + table_header->length;
> +
> + /* Parse all entries looking for a match. */
> + sub_header = (struct acpi_subtable_header *)
> + ((unsigned long)table_header +
> + sizeof(struct acpi_table_srat));
> + subtable_len = sub_header->length;
> +
> + while (((unsigned long)sub_header) + subtable_len < table_end) {
> + /*
> + * If length is 0, break from this loop to avoid
> + * infinite loop.
> + */
> + if (subtable_len == 0) {
> + pr_err("SRAT invalid zero length\n");
> + break;
> + }
> +
> + switch (sub_header->type) {
> + case ACPI_SRAT_TYPE_CPU_AFFINITY:
> + cpu = (struct acpi_srat_cpu_affinity *)sub_header;
> + pxm = *((u32 *)cpu->proximity_domain_hi) << 8 |
> + cpu->proximity_domain_lo;
> + if (pxm > max_pxm)
> + max_pxm = pxm;
> + break;
> + case ACPI_SRAT_TYPE_GENERIC_AFFINITY:
> + gpu = (struct acpi_srat_generic_affinity *)sub_header;
> + bdf = *((u16 *)(&gpu->device_handle[0])) << 16 |
> + *((u16 *)(&gpu->device_handle[2]));
> + if (bdf == pci_id) {
> + found = true;
> + numa_node = pxm_to_node(gpu->proximity_domain);
> + }
> + break;
> + default:
> + break;
> + }
> +
> + if (found)
> + break;
> +
> + sub_header = (struct acpi_subtable_header *)
> + ((unsigned long)sub_header + subtable_len);
> + subtable_len = sub_header->length;
> + }
> +
> + acpi_put_table(table_header);
> +
> + /* Workaround bad cpu-gpu binding case */
> + if (found && (numa_node < 0 || numa_node > max_pxm))
> + numa_node = 0;
> +
> + if (numa_node != NUMA_NO_NODE)
> + set_dev_node(&kdev->pdev->dev, numa_node);
> +}
> +#endif
> +
> /* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU
> * to its NUMA node
> * @avail_size: Available size in the memory
> @@ -1804,10 +1889,16 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
> }
>
> sub_type_hdr->proximity_domain_from = proximity_domain;
> -#ifdef CONFIG_NUMA
> +
> +#ifdef CONFIG_ACPI_NUMA
> if (kdev->pdev->dev.numa_node == NUMA_NO_NODE)
> + kfd_find_numa_node_in_srat(kdev);
> +#endif
> +#ifdef CONFIG_NUMA
> + if (kdev->pdev->dev.numa_node == NUMA_NO_NODE) {
> sub_type_hdr->proximity_domain_to = 0;
> - else
> + set_dev_node(&kdev->pdev->dev, 0);
This should not be here. If you really want to lie about the NUMA node
and pretend that it's 0 and not NO_NODE, then that should be done in
kfd_find_numa_node_in_srat. That should be the only function that
changes the dev->numa_node. Like Oak pointed out, eventually that should
maybe not even be part of the driver. But I'm OK with keeping it as a
fallback in the driver for the case that one a GPU doesn't have a NUMA
node assigned by the kernel.
But is setting the dev->numa_node to 0 really necessary? Does anything
else in KFD depend on the dev->numa_node being 0? This function is only
supposed to fill the proximity_domain in the CRAT table. Setting
dev->numa_node is a side effect. If we can't figure out the correct NUMA
node, we shouldn't just guess 0 in a way that potentially affects other
parts of the kernel.
Regards,
Felix
> + } else
> sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node;
> #else
> sub_type_hdr->proximity_domain_to = 0;
More information about the amd-gfx
mailing list