No subject

David M Nieto david.nieto at amd.com
Thu May 6 22:37:31 UTC 2021


During stress testing we found that, with some Vulkan applications,
the fence information displayed in the recently added fdinfo was not
properly calculated. Two issues were discovered:

(1) A missing dma_fence_put() in the loop that calculates the usage
ratios when the fence is being ignored.
(2) The approximation for the ratio calculation is not accurate
when accounting for non-active contexts. The fix is to ignore those
contexts if they have activity ratios lower than 0.01%.

Attached is also a script demonstrating how the fdinfo can be used
to monitor GPU usage of running processes.

#!/usr/bin/env python3

#
# Copyright (C) 2021 Advanced Micro Devices. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of
# this software and associated documentation files (the "Software"), to
# deal in
# the Software without restriction, including without limitation the
# rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of
# the Software, and to permit persons to whom the Software is furnished
# to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
# IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#

from tokenize import tokenize
import sys
import os
import pwd

# Running totals across every scanned process, keyed by the device
# identifier read from the "pdev:" fdinfo line.
# total_mem maps device -> summed memory figure.
total_mem = {}
# total_usage maps device -> {ring family -> summed usage percentage}.
total_usage = {}
def can_access(path):
    """Tell whether the ``fdinfo`` entry below *path* passes an X_OK check.

    Args:
        path: Directory path (as a string) expected to contain ``fdinfo``.

    Returns:
        The result of ``os.access`` with ``os.X_OK`` on ``<path>/fdinfo``.
    """
    fdinfo_dir = path + "/fdinfo"
    return os.access(fdinfo_dir, os.X_OK)


def calc_perc(entry, metric):
    """Return the total recorded for *metric* in *entry*.

    Args:
        entry: Mapping from metric name to either a list of per-ring
            values or a single already-summed value.
        metric: Name of the metric to look up (for example ``"gfx"``).

    Returns:
        ``0.0`` when *metric* is absent, the sum of the list when the
        stored value is a list, otherwise the stored value unchanged.
    """
    if metric not in entry:
        return 0.0
    value = entry[metric]
    if isinstance(value, list):
        return sum(value)
    return value

# Ring families looked for in fdinfo; a usage line is keyed
# "<family><index>:" and carries a percentage as its second field.
_RING_NAMES = ("gfx", "compute", "dma", "enc", "dec")
# Number of ring slots tracked per family for each file descriptor.
_MAX_RINGS = 8


def _parse_fdinfo(lines):
    """Parse the lines of one fdinfo file into a per-descriptor entry.

    Recognised keys are ``pdev:``, ``pasid:``, ``vram`` (amount read from
    the third whitespace-separated field) and ``<ring><index>:`` for each
    family in ``_RING_NAMES``.  Blank, truncated or malformed lines are
    skipped instead of aborting the whole scan.

    Returns:
        A dict that may contain ``"pdev"``, ``"pasid"``, ``"mem"`` and one
        ``_MAX_RINGS``-long list of floats per ring family seen.
    """
    entry = {}
    for line in lines:
        fields = line.split()
        if len(fields) < 2:
            continue
        key, value = fields[0], fields[1]

        if key == "pdev:":
            entry["pdev"] = value
        elif key == "pasid:":
            entry["pasid"] = value
        elif key == "vram":
            if len(fields) > 2 and fields[2].isdigit():
                entry["mem"] = int(fields[2])
        else:
            for ring in _RING_NAMES:
                if not key.startswith(ring):
                    continue
                index_text = key[len(ring):].rstrip(":")
                # NOTE(review): keys whose suffix is not a plain ring index
                # (or whose index does not fit the table) are ignored here;
                # confirm against the kernel's fdinfo format if such keys
                # need to be accounted for.
                if index_text.isdigit() and int(index_text) < _MAX_RINGS:
                    try:
                        usage = float(value.rstrip("%"))
                    except ValueError:
                        break
                    slots = entry.setdefault(ring, [0] * _MAX_RINGS)
                    slots[int(index_text)] = usage
                break
    return entry


def _accumulate(device_stat, entry):
    """Add one descriptor's memory and ring usage into *device_stat*."""
    if "mem" in entry:
        device_stat["mem"] = device_stat.get("mem", 0) + entry["mem"]
    for ring in _RING_NAMES:
        if ring not in entry:
            continue
        if ring in device_stat:
            device_stat[ring] = [
                a + b for a, b in zip(device_stat[ring], entry[ring])
            ]
        else:
            device_stat[ring] = list(entry[ring])


def process_pid(file):
    """Collect GPU usage for one process, print its rows, update totals.

    Every file under ``<file.path>/fdinfo`` is parsed.  Descriptors that
    report the same (pdev, pasid) pair are counted once.  For each device
    found, one table row is printed and the module-level ``total_mem`` and
    ``total_usage`` accumulators are updated.

    Args:
        file: Object exposing ``path`` (the process directory) and ``name``
            (the PID as a string), such as an ``os.DirEntry`` for
            ``/proc/<pid>``.

    Returns:
        None.  Processes or descriptors that disappear while being read
        are skipped rather than raising.
    """
    stat = dict()
    seen = set()

    try:
        with os.scandir(file.path + "/fdinfo") as fds:
            for fd in fds:
                try:
                    with open(fd) as f:
                        entry = _parse_fdinfo(f)
                except OSError:
                    # The descriptor was closed (or became unreadable)
                    # between listing and opening it.
                    continue

                if "pdev" not in entry or "pasid" not in entry:
                    continue
                ident = (entry["pdev"], entry["pasid"])
                if ident in seen:
                    continue
                seen.add(ident)

                _accumulate(stat.setdefault(entry["pdev"], dict()), entry)
    except OSError:
        # The process exited or its fdinfo directory cannot be listed.
        return

    if not stat:
        return

    try:
        with open(file.path + "/comm") as f:
            name = f.readline().strip()
    except OSError:
        name = "?"

    # The process directory is owned by the process creator, so its owner
    # UID identifies the user.
    try:
        uid = os.stat(file.path).st_uid
    except OSError:
        username = "?"
    else:
        try:
            username = pwd.getpwuid(uid)[0]
        except KeyError:
            # No passwd entry for this UID; show the number instead.
            username = str(uid)

    for s, dev in stat.items():
        dev["pid"] = file.name
        dev["name"] = name
        # A device may have been seen without any "vram" line.
        mem = int(dev.get("mem", 0))

        total_mem[s] = total_mem.get(s, 0) + mem
        usage = total_usage.setdefault(s, dict())
        for ring in _RING_NAMES:
            if ring in dev:
                usage[ring] = usage.get(ring, 0) + calc_perc(dev, ring)

        print("| {0:5s} | {1:16s} | {9:10s} | {2} | {3:7d} KiB | {4:6.2f}  {5:6.2f}  {6:6.2f}  {7:6.2f}  {8:6.2f}  |"
              .format(dev["pid"],
                      dev["name"],
                      s,
                      mem,
                      calc_perc(dev, 'gfx'),
                      calc_perc(dev, 'compute'),
                      calc_perc(dev, 'dma'),
                      calc_perc(dev, 'enc'),
                      calc_perc(dev, 'dec'),
                      username))
        print("+-------+------------------+------------+--------------+-------------+-----------------------------------------+")

# Scan every numeric directory under /proc, print one table row per
# process/device pair, then a TOTAL row per device.
path = "/proc/"

# Table header; the column widths line up with the "+===+" border below.
print("+=======+==================+============+==============+=============+=========================================+")
print("| pid   | name             | user       | gpu bdf      | fb usage    | ring usage (%)                          |")
print("|       |                  |            |              |             | gfx     comp    dma     enc     dec     |")
print("+=======+==================+============+==============+=============+=========================================+")

with os.scandir(path) as proc_entries:
    for file in proc_entries:
        try:
            is_pid_dir = file.is_dir() and file.name.isnumeric()
        except OSError:
            # The entry vanished while it was being inspected.
            continue
        if is_pid_dir and can_access(file.path):
            process_pid(file)

# One summary row per device, using the totals accumulated during the scan.
for gpu in total_mem:
    print("|                                 TOTAL:| {0} | {1:7d} KiB | {2:6.2f}  {3:6.2f}  {4:6.2f}  {5:6.2f}  {6:6.2f}  |".format(
        gpu,
        total_mem[gpu],
        calc_perc(total_usage[gpu], 'gfx'),
        calc_perc(total_usage[gpu], 'compute'),
        calc_perc(total_usage[gpu], 'dma'),
        calc_perc(total_usage[gpu], 'enc'),
        calc_perc(total_usage[gpu], 'dec'),
        ))
print("+=======+==================+============+==============+=============+=====================+++=================+")






More information about the amd-gfx mailing list