<div dir="auto"><div><br><div class="gmail_extra"><br><div class="gmail_quote">On 7 Jul. 2017 19:29, "Christian König" &lt;<a href="mailto:deathsimple@vodafone.de">deathsimple@vodafone.de</a>&gt; wrote:<br type="attribution"><blockquote class="quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">What tiling format do the destination textures have?<br>
<br>
Sounds like the offset is just added so that we distribute memory accesses more evenly across the memory channels.<br></blockquote></div></div></div><div dir="auto"><br></div><div dir="auto">From the traces, I think the tile index mode was 10.</div><div dir="auto"><br></div><div dir="auto">Dave.</div><div dir="auto"><div class="gmail_extra"><div class="gmail_quote"><blockquote class="quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
Regards,<br>
Christian.<div class="elided-text"><br>
<br>
Am 07.07.2017 um 09:18 schrieb Dave Airlie:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
From: Dave Airlie &lt;<a href="mailto:airlied@redhat.com" target="_blank">airlied@redhat.com</a>&gt;<br>
<br>
(This patch doesn't seem to work fully; hopefully AMD can tell us<br>
more about the rules and how to calculate the magic offset.)<br>
<br>
It appears that, to get full access to memory bandwidth with MRT<br>
rendering, the -pro Vulkan driver offsets each image by 0x3800.<br>
I'm not sure how that value is calculated.<br>
<br>
Glenn came up with the idea (probably what -pro does also) of just<br>
offsetting every image in round-robin order, in the hope that apps<br>
would create MRT images in sequence anyway.<br>
<br>
This attempts to do that using an atomic counter in the device.<br>
<br>
This gets the deferred demo from 800fps->1150fps on my rx480.<br>
<br>
(I've tested that dota2 and talos at least still run after this.)<br>
---<br>
src/amd/vulkan/radv_device.c | 7 ++++---<br>
src/amd/vulkan/radv_image.c | 16 +++++++++++++++-<br>
src/amd/vulkan/radv_private.h | 3 +++<br>
3 files changed, 22 insertions(+), 4 deletions(-)<br>
<br>
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c<br>
index d1c519a..f39526d 100644<br>
--- a/src/amd/vulkan/radv_device.c<br>
+++ b/src/amd/vulkan/radv_device.c<br>
@@ -2706,7 +2706,7 @@ radv_initialise_color_surface(<wbr>struct radv_device *device,<br>
/* Intensity is implemented as Red, so treat it that way. */<br>
cb->cb_color_attrib = S_028C74_FORCE_DST_ALPHA_1(des<wbr>c->swizzle[3] == VK_SWIZZLE_1);<br>
- va = device->ws->buffer_get_va(ivie<wbr>w->bo) + iview->image->offset;<br>
+ va = device->ws->buffer_get_va(ivie<wbr>w->bo) + iview->image->offset + iview->image->mrt_offset;<br>
if (device->physical_device->rad_<wbr>info.chip_class >= GFX9) {<br>
struct gfx9_surf_meta_flags meta;<br>
@@ -2756,11 +2756,11 @@ radv_initialise_color_surface(<wbr>struct radv_device *device,<br>
/* CMASK variables */<br>
va = device->ws->buffer_get_va(ivie<wbr>w->bo) + iview->image->offset;<br>
- va += iview->image->cmask.offset;<br>
+ va += iview->image->cmask.offset + iview->image->mrt_offset;<br>
cb->cb_color_cmask = va >> 8;<br>
va = device->ws->buffer_get_va(ivie<wbr>w->bo) + iview->image->offset;<br>
- va += iview->image->dcc_offset;<br>
+ va += iview->image->dcc_offset + iview->image->mrt_offset;<br>
cb->cb_dcc_base = va >> 8;<br>
uint32_t max_slice = radv_surface_layer_count(iview<wbr>);<br>
@@ -2776,6 +2776,7 @@ radv_initialise_color_surface(<wbr>struct radv_device *device,<br>
if (iview->image->fmask.size) {<br>
va = device->ws->buffer_get_va(ivie<wbr>w->bo) + iview->image->offset + iview->image->fmask.offset;<br>
+ va += iview->image->mrt_offset;<br>
cb->cb_color_fmask = va >> 8;<br>
} else {<br>
cb->cb_color_fmask = cb->cb_color_base;<br>
diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c<br>
index b3a223b..bc20a53 100644<br>
--- a/src/amd/vulkan/radv_image.c<br>
+++ b/src/amd/vulkan/radv_image.c<br>
@@ -31,6 +31,7 @@<br>
#include "sid.h"<br>
#include "gfx9d.h"<br>
#include "util/debug.h"<br>
+#include "util/u_atomic.h"<br>
static unsigned<br>
radv_choose_tiling(struct radv_device *Device,<br>
const struct radv_image_create_info *create_info)<br>
@@ -208,6 +209,7 @@ si_set_mutable_tex_desc_fields<wbr>(struct radv_device *device,<br>
} else<br>
va += base_level_info->offset;<br>
+ va += image->mrt_offset;<br>
state[0] = va >> 8;<br>
state[1] &= C_008F14_BASE_ADDRESS_HI;<br>
state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);<br>
@@ -220,6 +222,7 @@ si_set_mutable_tex_desc_fields<wbr>(struct radv_device *device,<br>
state[7] = 0;<br>
if (image->surface.dcc_size && first_level < image->surface.num_dcc_levels) {<br>
uint64_t meta_va = gpu_address + image->dcc_offset;<br>
+ meta_va += image->mrt_offset;<br>
if (chip_class <= VI)<br>
meta_va += base_level_info->dcc_offset;<br>
state[6] |= S_008F28_COMPRESSION_EN(1);<br>
@@ -436,7 +439,7 @@ si_make_texture_descriptor(str<wbr>uct radv_device *device,<br>
uint64_t gpu_address = device->ws->buffer_get_va(imag<wbr>e->bo);<br>
uint64_t va;<br>
- va = gpu_address + image->offset + image->fmask.offset;<br>
+ va = gpu_address + image->offset + image->mrt_offset + image->fmask.offset;<br>
if (device->physical_device->rad_<wbr>info.chip_class >= GFX9) {<br>
fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK<wbr>;<br>
@@ -642,6 +645,7 @@ radv_image_alloc_fmask(struct radv_device *device,<br>
radv_image_get_fmask_info(devi<wbr>ce, image, image->info.samples, &image->fmask);<br>
image->fmask.offset = align64(image->size, image->fmask.alignment);<br>
+ image->fmask.size += image->mrt_offset;<br>
image->size = image->fmask.offset + image->fmask.size;<br>
image->alignment = MAX2(image->alignment, image->fmask.alignment);<br>
}<br>
@@ -709,6 +713,7 @@ radv_image_alloc_cmask(struct radv_device *device,<br>
radv_image_get_cmask_info(devi<wbr>ce, image, &image->cmask);<br>
image->cmask.offset = align64(image->size, image->cmask.alignment);<br>
+ image->cmask.size += image->mrt_offset;<br>
/* + 8 for storing the clear values */<br>
if (!image->clear_value_offset) {<br>
image->clear_value_offset = image->cmask.offset + image->cmask.size;<br>
@@ -724,6 +729,7 @@ radv_image_alloc_dcc(struct radv_device *device,<br>
{<br>
image->dcc_offset = align64(image->size, image->surface.dcc_alignment);<br>
/* + 16 for storing the clear values + dcc pred */<br>
+ image->surface.dcc_size += image->mrt_offset;<br>
image->clear_value_offset = image->dcc_offset + image->surface.dcc_size;<br>
image->dcc_pred_offset = image->clear_value_offset + 8;<br>
image->size = image->dcc_offset + image->surface.dcc_size + 16;<br>
@@ -801,6 +807,14 @@ radv_image_create(VkDevice _device,<br>
image->size = image->surface.surf_size;<br>
image->alignment = image->surface.surf_alignment;<br>
+ if ((pCreateInfo->usage & VK_IMAGE_USAGE_COLOR_ATTACHMEN<wbr>T_BIT) && !create_info->scanout) {<br>
+ uint32_t mrt_idx = p_atomic_inc_return(&device->i<wbr>mage_mrt_offset_counter) - 1;<br>
+ mrt_idx %= 8;<br>
+ mrt_idx *= 0x3800;<br>
+ image->mrt_offset = mrt_idx;<br>
+ image->size += image->mrt_offset;<br>
+ }<br>
+<br>
if (image->exclusive || image->queue_family_mask == 1)<br>
can_cmask_dcc = true;<br>
diff --git a/src/amd/vulkan/radv_private.<wbr>h b/src/amd/vulkan/radv_private.<wbr>h<br>
index 5c30d18..f09095a 100644<br>
--- a/src/amd/vulkan/radv_private.<wbr>h<br>
+++ b/src/amd/vulkan/radv_private.<wbr>h<br>
@@ -547,6 +547,8 @@ struct radv_device {<br>
/* Backup in-memory cache to be used if the app doesn't provide one */<br>
struct radv_pipeline_cache * mem_cache;<br>
+<br>
+ uint32_t image_mrt_offset_counter;<br>
};<br>
struct radv_device_memory {<br>
@@ -1211,6 +1213,7 @@ struct radv_image {<br>
/* Set when bound */<br>
struct radeon_winsys_bo *bo;<br>
VkDeviceSize offset;<br>
+ VkDeviceSize mrt_offset;<br>
uint32_t dcc_offset;<br>
uint32_t htile_offset;<br>
struct radeon_surf surface;<br>
</blockquote>
<br>
<br>
</div></blockquote></div><br></div></div></div>