No subject
David M Nieto
david.nieto at amd.com
Thu May 6 22:37:31 UTC 2021
During stress testing we found that, with some Vulkan applications,
the fence information displayed in the recently added fdinfo was not
properly calculated. Two issues were discovered:
(1) A missing dma_fence_put() in the loop that calculates the usage
ratios when the fence is being ignored.
(2) The approximation for the ratio calculation is not accurate
when accounting for non-active contexts. The fix is to ignore those
contexts if they have activity ratios lower than 0.01%.
Also attached is a script demonstrating how the fdinfo can be used
to monitor GPU usage on running processes.
#!/usr/bin/env python3
#
# Copyright (C) 2021 Advanced Micro Devices. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of
# this software and associated documentation files (the "Software"), to
# deal in
# the Software without restriction, including without limitation the
# rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of
# the Software, and to permit persons to whom the Software is furnished
# to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
# IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
from tokenize import tokenize
import sys
import os
import pwd
# Running totals across all scanned processes, keyed by the "pdev" string
# read from fdinfo. total_mem maps pdev -> summed "vram" value; total_usage
# maps pdev -> {engine name -> summed usage percentage}.
total_mem = {}
total_usage = {}
def can_access(path):
    """Return True if the ``fdinfo`` directory under *path* can be listed.

    *path* is a process directory such as ``/proc/1234``. Listing a
    directory needs read permission and opening its entries needs search
    (execute) permission, so both are checked.
    """
    return os.access(os.path.join(path, "fdinfo"), os.R_OK | os.X_OK)
def calc_perc(entry, metric):
    """Return the usage figure stored under *metric* in *entry*.

    Args:
        entry: dict mapping metric names to either a single number or a
            list of per-ring numbers.
        metric: key to look up, e.g. ``'gfx'``.

    Returns:
        0.0 when *metric* is absent, the sum of the list when the value is
        a list, otherwise the stored value unchanged.
    """
    if metric not in entry:
        return 0.0
    value = entry[metric]
    if isinstance(value, list):
        return sum(value)
    return value
# Engine names whose per-ring usage lines ("gfx0:", "dma1:", ...) are parsed.
_ENGINES = ("gfx", "dma", "dec", "enc", "compute")
# Number of ring slots allocated per engine up front; a list grows if a
# higher ring index is reported.
_RING_SLOTS = 8
# Errors seen when a process or one of its descriptors disappears (or
# becomes unreadable) between listing it and reading it.
_PROCESS_GONE = (FileNotFoundError, ProcessLookupError, PermissionError)


def _parse_fdinfo(lines):
    """Parse the lines of one fdinfo file into a dict.

    Recognised lines are ``pdev: <id>``, ``pasid: <id>``,
    ``vram <label> <int> ...`` and ``<engine><N>: <float>%``. Everything
    else, including blank or truncated lines, is ignored.
    """
    entry = {}
    for line in lines:
        fields = line.split()
        if len(fields) < 2:
            continue
        tag = fields[0]
        if tag == "pdev:":
            entry["pdev"] = fields[1]
        elif tag == "pasid:":
            entry["pasid"] = fields[1]
        elif tag == "vram":
            if len(fields) > 2 and fields[2].isdecimal():
                entry["mem"] = int(fields[2])
        else:
            name = tag.rstrip(":")
            for engine in _ENGINES:
                ring = name[len(engine):]
                # Require "<engine><digits>" exactly, so unrelated keys that
                # merely contain an engine name are not mistaken for rings.
                if name.startswith(engine) and ring.isdecimal():
                    rings = entry.setdefault(engine, [0] * _RING_SLOTS)
                    slot = int(ring)
                    if slot >= len(rings):
                        rings.extend([0] * (slot + 1 - len(rings)))
                    rings[slot] = float(fields[1].rstrip("%"))
                    break
    return entry


def _merge_rings(left, right):
    """Return the element-wise sum of two per-ring lists, zero-padding the shorter."""
    size = max(len(left), len(right))
    left = left + [0] * (size - len(left))
    right = right + [0] * (size - len(right))
    return [a + b for a, b in zip(left, right)]


def _collect_stats(fdinfo_dir):
    """Return ``{pdev: {"mem": int, <engine>: [per-ring usage]}}`` for one process.

    Descriptors reporting a (pdev, pasid) pair that was already seen are
    counted only once; descriptors lacking either field are skipped.
    """
    stat = {}
    seen = set()
    with os.scandir(fdinfo_dir) as fds:
        for fd in fds:
            try:
                with open(fd.path) as f:
                    entry = _parse_fdinfo(f)
            except _PROCESS_GONE:
                # Descriptor closed (or process gone) after it was listed.
                continue
            if "pdev" not in entry or "pasid" not in entry:
                continue
            client = (entry["pdev"], entry["pasid"])
            if client in seen:
                continue
            seen.add(client)
            dev = stat.setdefault(entry["pdev"], {"mem": 0})
            dev["mem"] += entry.get("mem", 0)
            for engine in _ENGINES:
                if engine not in entry:
                    continue
                if engine in dev:
                    dev[engine] = _merge_rings(dev[engine], entry[engine])
                else:
                    dev[engine] = entry[engine]
    return stat


def process_pid(file):
    """Print one table row per GPU used by a process and update the totals.

    Args:
        file: ``os.DirEntry`` for the process directory (e.g. ``/proc/1234``);
            ``file.name`` is the pid and ``file.path`` the directory.

    Side effects:
        Adds the process's figures to the module-level ``total_mem`` and
        ``total_usage`` dicts and prints to stdout. A process that exposes
        no GPU fdinfo, or that disappears while being read, produces no
        output and leaves the totals untouched.
    """
    try:
        stat = _collect_stats(os.path.join(file.path, "fdinfo"))
        if not stat:
            return
        with open(os.path.join(file.path, "comm"), errors="replace") as f:
            name = f.readline().strip()
        # The process directory is owned by the user that created the process.
        uid = os.stat(file.path).st_uid
    except _PROCESS_GONE:
        return

    try:
        username = pwd.getpwuid(uid)[0]
    except KeyError:
        # No passwd entry for this uid; show the number instead.
        username = str(uid)

    for pdev, dev in stat.items():
        total_mem[pdev] = total_mem.get(pdev, 0) + dev["mem"]
        usage = total_usage.setdefault(pdev, {})
        for engine in _ENGINES:
            if engine in dev:
                usage[engine] = usage.get(engine, 0.0) + calc_perc(dev, engine)

        # The source's format string was split across two lines and had its
        # spacing collapsed; the padding here is rebuilt so that every column
        # matches the width of the separator line printed below.
        print("| {0:5s} | {1:16s} | {9:10s} | {2} | {3:7d} KiB |"
              "  {4:6.2f}  {5:6.2f}  {6:6.2f}  {7:6.2f}  {8:6.2f} |"
              .format(file.name,
                      name,
                      pdev,
                      dev["mem"],
                      calc_perc(dev, 'gfx'),
                      calc_perc(dev, 'compute'),
                      calc_perc(dev, 'dma'),
                      calc_perc(dev, 'enc'),
                      calc_perc(dev, 'dec'),
                      username))
        # NOTE(review): the original nesting of this separator was lost with
        # the indentation; it is assumed to follow every row - confirm.
        print("+-------+------------------+------------+--------------+-------------+-----------------------------------------+")
# Script body: print the table header, one row per process/GPU (via
# process_pid), then one TOTAL row per GPU.
path = "/proc/"
print("+=======+==================+============+==============+=============+=========================================+")
# The header literals were split across lines and had their spacing
# collapsed in the source; the padding is rebuilt from the border widths.
print("| {0:5s} | {1:16s} | {2:10s} | {3:12s} | {4:11s} | {5:39s} |"
      .format("pid", "name", "user", "gpu bdf", "fb usage", "ring usage (%)"))
print("| {0:5s} | {0:16s} | {0:10s} | {0:12s} | {0:11s} |"
      "  {1:>6s}  {2:>6s}  {3:>6s}  {4:>6s}  {5:>6s} |"
      .format("", "gfx", "comp", "dma", "enc", "dec"))
print("+=======+==================+============+==============+=============+=========================================+")
with os.scandir(path) as proc_entries:
    for file in proc_entries:
        # Only numerically named directories are treated as processes.
        if not (file.is_dir() and file.name.isnumeric()):
            continue
        if not can_access(file.path):
            continue
        try:
            process_pid(file)
        except (FileNotFoundError, ProcessLookupError):
            # The process exited while it was being read; skip it.
            continue
for gpu in total_mem:
    # "TOTAL:" fills the pid column; the name and user columns are left
    # blank so the GPU id lines up with the "gpu bdf" column.
    print("| TOTAL:| {0:>44s} | {1:7d} KiB |"
          "  {2:6.2f}  {3:6.2f}  {4:6.2f}  {5:6.2f}  {6:6.2f} |"
          .format(gpu,
                  total_mem[gpu],
                  calc_perc(total_usage[gpu], 'gfx'),
                  calc_perc(total_usage[gpu], 'compute'),
                  calc_perc(total_usage[gpu], 'dma'),
                  calc_perc(total_usage[gpu], 'enc'),
                  calc_perc(total_usage[gpu], 'dec')))
# NOTE(review): the original nesting of this closing border was lost with
# the indentation; it is assumed to be printed once after all totals - confirm.
print("+=======+==================+============+==============+=============+=====================+++=================+")
More information about the amd-gfx
mailing list