<div dir="ltr"><div dir="ltr"><br></div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Sat, May 4, 2019 at 9:39 AM Nicolai Hähnle <<a href="mailto:nhaehnle@gmail.com">nhaehnle@gmail.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">From: Nicolai Hähnle <<a href="mailto:nicolai.haehnle@amd.com" target="_blank">nicolai.haehnle@amd.com</a>><br>
<br>
Upcoming changes to LLVM will emit LDS objects as symbols in the ELF<br>
symbol table, with relocations that will be resolved with this change.<br>
<br>
Callers will also be able to define LDS symbols that are shared between<br>
shader parts. This will be used by radeonsi for the ESGS ring in gfx9+<br>
merged shaders.<br>
---<br>
src/amd/common/ac_rtld.c | 210 ++++++++++++++++--<br>
src/amd/common/ac_rtld.h | 39 +++-<br>
src/gallium/drivers/radeonsi/si_compute.c | 9 +-<br>
src/gallium/drivers/radeonsi/si_debug.c | 22 +-<br>
src/gallium/drivers/radeonsi/si_shader.c | 61 +++--<br>
src/gallium/drivers/radeonsi/si_shader.h | 5 +-<br>
.../drivers/radeonsi/si_state_shaders.c | 2 +-<br>
7 files changed, 296 insertions(+), 52 deletions(-)<br>
<br>
diff --git a/src/amd/common/ac_rtld.c b/src/amd/common/ac_rtld.c<br>
index 4e0468d2062..3df7b3ba51f 100644<br>
--- a/src/amd/common/ac_rtld.c<br>
+++ b/src/amd/common/ac_rtld.c<br>
@@ -24,25 +24,31 @@<br>
#include "ac_rtld.h"<br>
<br>
#include <gelf.h><br>
#include <libelf.h><br>
#include <stdarg.h><br>
#include <stdio.h><br>
#include <stdlib.h><br>
#include <string.h><br>
<br>
#include "ac_binary.h"<br>
+#include "ac_gpu_info.h"<br>
+#include "util/u_dynarray.h"<br>
#include "util/u_math.h"<br>
<br>
// Old distributions may not have this enum constant<br>
#define MY_EM_AMDGPU 224<br>
<br>
+#ifndef STT_AMDGPU_LDS<br>
+#define STT_AMDGPU_LDS 13<br>
+#endif<br>
+<br>
#ifndef R_AMDGPU_NONE<br>
#define R_AMDGPU_NONE 0<br>
#define R_AMDGPU_ABS32_LO 1<br>
#define R_AMDGPU_ABS32_HI 2<br>
#define R_AMDGPU_ABS64 3<br>
#define R_AMDGPU_REL32 4<br>
#define R_AMDGPU_REL64 5<br>
#define R_AMDGPU_ABS32 6<br>
#define R_AMDGPU_GOTPCREL 7<br>
#define R_AMDGPU_GOTPCREL32_LO 8<br>
@@ -97,41 +103,155 @@ static void report_elf_errorf(const char *fmt, ...) PRINTFLIKE(1, 2);<br>
static void report_elf_errorf(const char *fmt, ...)<br>
{<br>
va_list va;<br>
va_start(va, fmt);<br>
report_erroraf(fmt, va);<br>
va_end(va);<br>
<br>
fprintf(stderr, "ELF error: %s\n", elf_errmsg(elf_errno()));<br>
}<br>
<br>
+/**<br>
+ * Find a symbol in a dynarray of struct ac_rtld_symbol by \p name and shader<br>
+ * \p part_idx.<br>
+ */<br>
+static const struct ac_rtld_symbol *find_symbol(const struct util_dynarray *symbols,<br>
+ const char *name, unsigned part_idx)<br>
+{<br>
+ util_dynarray_foreach(symbols, struct ac_rtld_symbol, symbol) {<br>
+ if ((symbol->part_idx == ~0u || symbol->part_idx == part_idx) &&<br>
+ !strcmp(name, symbol->name))<br>
+ return symbol;<br>
+ }<br>
+ return 0;<br>
+}<br>
+<br>
+static int compare_symbol_by_align(const void *lhsp, const void *rhsp)<br>
+{<br>
+ const struct ac_rtld_symbol *lhs = lhsp;<br>
+ const struct ac_rtld_symbol *rhs = rhsp;<br>
+ if (rhs->align > lhs->align)<br>
+ return -1;<br>
+ if (rhs->align < lhs->align)<br>
+ return 1;<br>
+ return 0;<br>
+}<br>
+<br>
+/**<br>
+ * Sort the given symbol list by decreasing alignment and assign offsets.<br>
+ */<br>
+static bool layout_symbols(struct ac_rtld_symbol *symbols, unsigned num_symbols,<br>
+ uint64_t *ptotal_size)<br>
+{<br>
+ qsort(symbols, num_symbols, sizeof(*symbols), compare_symbol_by_align);<br>
+<br>
+ uint64_t total_size = *ptotal_size;<br>
+<br>
+ for (unsigned i = 0; i < num_symbols; ++i) {<br>
+ struct ac_rtld_symbol *s = &symbols[i];<br>
+ assert(util_is_power_of_two_nonzero(s->align));<br>
+<br>
+ total_size = align64(total_size, s->align);<br>
+ s->offset = total_size;<br>
+<br>
+ if (total_size + s->size < total_size) {<br>
+ report_errorf("%s: size overflow", __FUNCTION__);<br>
+ return false;<br>
+ }<br>
+<br>
+ total_size += s->size;<br>
+ }<br>
+<br>
+ *ptotal_size = total_size;<br>
+ return true;<br>
+}<br>
+<br>
+/**<br>
+ * Read LDS symbols from the given \p section of the ELF of \p part and append<br>
+ * them to the LDS symbols list.<br>
+ *<br>
+ * Shared LDS symbols are filtered out.<br>
+ */<br>
+static bool read_private_lds_symbols(struct ac_rtld_binary *binary,<br>
+ unsigned part_idx,<br>
+ Elf_Scn *section,<br>
+ uint32_t *lds_end_align)<br>
+{<br>
+#define report_elf_if(cond) \<br>
+ do { \<br>
+ if ((cond)) { \<br>
+ report_errorf(#cond); \<br>
+ return false; \<br>
+ } \<br>
+ } while (false)<br>
+<br>
+ struct ac_rtld_part *part = &binary->parts[part_idx];<br>
+ Elf64_Shdr *shdr = elf64_getshdr(section);<br>
+ uint32_t strtabidx = shdr->sh_link;<br>
+ Elf_Data *symbols_data = elf_getdata(section, NULL);<br>
+ report_elf_if(!symbols_data);<br>
+<br>
+ const Elf64_Sym *symbol = symbols_data->d_buf;<br>
+ size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym);<br>
+<br>
+ for (size_t j = 0; j < num_symbols; ++j, ++symbol) {<br>
+ if (ELF64_ST_TYPE(symbol->st_info) != STT_AMDGPU_LDS)<br>
+ continue;<br>
+<br>
+ report_elf_if(symbol->st_size > 1u << 29);<br>
+<br>
+ struct ac_rtld_symbol s = {};<br>
+ s.name = elf_strptr(part->elf, strtabidx, symbol->st_name);<br>
+ s.size = symbol->st_size;<br>
+ s.align = MIN2(1u << (symbol->st_other >> 3), 1u << 16);<br>
+ s.part_idx = part_idx;<br>
+<br>
+ if (!strcmp(s.name, "__lds_end")) {<br>
+ report_elf_if(s.size != 0);<br>
+ *lds_end_align = MAX2(*lds_end_align, s.align);<br>
+ continue;<br>
+ }<br>
+<br>
+ const struct ac_rtld_symbol *shared =<br>
+ find_symbol(&binary->lds_symbols, s.name, part_idx);<br>
+ if (shared) {<br>
+ report_elf_if(s.align > shared->align);<br>
+ report_elf_if(s.size > shared->size);<br>
+ continue;<br>
+ }<br>
+<br>
+ util_dynarray_append(&binary->lds_symbols, struct ac_rtld_symbol, s);<br>
+ }<br>
+<br>
+ return true;<br>
+<br>
+#undef report_elf_if<br>
+}<br>
+<br>
/**<br>
* Open a binary consisting of one or more shader parts.<br>
*<br>
* \param binary the uninitialized struct<br>
- * \param num_parts number of shader parts<br>
- * \param elf_ptrs pointers to the in-memory ELF objects for each shader part<br>
- * \param elf_sizes sizes (in bytes) of the in-memory ELF objects<br>
+ * \param i binary opening parameters<br>
*/<br>
-bool ac_rtld_open(struct ac_rtld_binary *binary, unsigned num_parts,<br>
- const char * const *elf_ptrs,<br>
- const uint64_t *elf_sizes)<br>
+bool ac_rtld_open(struct ac_rtld_binary *binary,<br>
+ struct ac_rtld_open_info i)<br>
{<br>
/* One of the libelf implementations<br>
* (<a href="http://www.mr511.de/software/english.htm" rel="noreferrer" target="_blank">http://www.mr511.de/software/english.htm</a>) requires calling<br>
* elf_version() before elf_memory().<br>
*/<br>
elf_version(EV_CURRENT);<br>
<br>
memset(binary, 0, sizeof(*binary));<br>
- binary->num_parts = num_parts;<br>
- binary->parts = calloc(sizeof(*binary->parts), num_parts);<br>
+ binary->num_parts = i.num_parts;<br>
+ binary->parts = calloc(sizeof(*binary->parts), i.num_parts);<br>
if (!binary->parts)<br>
return false;<br>
<br>
uint64_t pasted_text_size = 0;<br>
uint64_t rx_align = 1;<br>
uint64_t rx_size = 0;<br>
<br>
#define report_if(cond) \<br>
do { \<br>
if ((cond)) { \<br>
@@ -140,25 +260,44 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, unsigned num_parts,<br>
} \<br>
} while (false)<br>
#define report_elf_if(cond) \<br>
do { \<br>
if ((cond)) { \<br>
report_elf_errorf(#cond); \<br>
goto fail; \<br>
} \<br>
} while (false)<br>
<br>
- /* First pass over all parts: open ELFs and determine the placement of<br>
- * sections in the memory image. */<br>
- for (unsigned i = 0; i < num_parts; ++i) {<br>
- struct ac_rtld_part *part = &binary->parts[i];<br>
- part->elf = elf_memory((char *)elf_ptrs[i], elf_sizes[i]);<br>
+ /* Copy and layout shared LDS symbols. */<br>
+ util_dynarray_resize(&binary->lds_symbols, struct ac_rtld_symbol, i.num_shared_lds_symbols);<br>
+ memcpy(binary->lds_symbols.data, i.shared_lds_symbols, binary->lds_symbols.size);<br>
+<br>
+ util_dynarray_foreach(&binary->lds_symbols, struct ac_rtld_symbol, symbol)<br>
+ symbol->part_idx = ~0u;<br>
+<br>
+ unsigned max_lds_size = i.info->chip_class >= CIK ? 64 * 1024 : 32 * 1024;<br></blockquote><div><br></div><div>For ESGS and LSHS, the max LDS size is 32K, because the other half is reserved for PS. A GPU hang can occur if more LDS is used for those stages.<br></div><div><br></div><div>Marek</div><br></div></div>