[Mesa-dev] [PATCH 11/18] radeonsi: use a clever alignment for constant buffer uploads
Marek Olšák
maraeo at gmail.com
Thu Feb 16 12:53:03 UTC 2017
From: Marek Olšák <marek.olsak at amd.com>
This results in a very tiny decrease in lgkm wait cycles.
---
src/gallium/drivers/radeon/radeon_winsys.h | 1 +
src/gallium/drivers/radeonsi/si_descriptors.c | 4 +++-
src/gallium/drivers/radeonsi/si_pipe.h | 15 +++++++++++++++
src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 1 +
src/gallium/winsys/radeon/drm/radeon_drm_winsys.c | 1 +
5 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 432550d..812c036 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -194,20 +194,21 @@ struct radeon_info {
bool gfx_ib_pad_with_type2;
bool has_sdma;
bool has_uvd;
uint32_t uvd_fw_version;
uint32_t vce_fw_version;
uint32_t me_fw_version;
uint32_t pfp_fw_version;
uint32_t ce_fw_version;
uint32_t vce_harvest_config;
uint32_t clock_crystal_freq;
+ uint32_t tcc_cache_line_size;
/* Kernel info. */
uint32_t drm_major; /* version */
uint32_t drm_minor;
uint32_t drm_patchlevel;
bool has_userptr;
/* Shader cores. */
uint32_t r600_max_quad_pipes; /* wave size / 16 */
uint32_t max_shader_clock;
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 8f636af..72b33f3 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1040,21 +1040,23 @@ static struct si_descriptors *
si_const_buffer_descriptors(struct si_context *sctx, unsigned shader)
{
return &sctx->descriptors[si_const_buffer_descriptors_idx(shader)];
}
void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
const uint8_t *ptr, unsigned size, uint32_t *const_offset)
{
void *tmp;
- u_upload_alloc(sctx->b.b.stream_uploader, 0, size, 256, const_offset,
+ u_upload_alloc(sctx->b.b.stream_uploader, 0, size,
+ si_optimal_tcc_alignment(sctx, size),
+ const_offset,
(struct pipe_resource**)rbuffer, &tmp);
if (*rbuffer)
util_memcpy_cpu_to_le32(tmp, ptr, size);
}
static void si_set_constant_buffer(struct si_context *sctx,
struct si_buffer_resources *buffers,
unsigned descriptors_idx,
uint slot, const struct pipe_constant_buffer *input)
{
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index fb24bab..bee6881 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -505,11 +505,26 @@ static inline struct si_shader* si_get_vs_state(struct si_context *sctx)
static inline bool si_vs_exports_prim_id(struct si_shader *shader)
{
if (shader->selector->type == PIPE_SHADER_VERTEX)
return shader->key.part.vs.epilog.export_prim_id;
else if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
return shader->key.part.tes.epilog.export_prim_id;
else
return false;
}
+static inline unsigned
+si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size)
+{
+ unsigned alignment, tcc_cache_line_size;
+
+ /* Returns MIN2(util_next_power_of_two(upload_size), TCC cache line size).
+ * An upload smaller than the cache line size (e.g. 16, 32) aligned to its
+ * size fits into a single cache line, so multiple small uploads can share
+ * one. A larger upload is aligned to the cache line size.
+ */
+ alignment = util_next_power_of_two(upload_size);
+ tcc_cache_line_size = sctx->screen->b.info.tcc_cache_line_size;
+ return MIN2(alignment, tcc_cache_line_size);
+}
+
#endif
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index db0087c..6511c48 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -338,20 +338,21 @@ static bool do_winsys_init(struct amdgpu_winsys *ws, int fd)
ws->info.max_se = ws->amdinfo.num_shader_engines;
ws->info.max_sh_per_se = ws->amdinfo.num_shader_arrays_per_engine;
ws->info.has_uvd = uvd.available_rings != 0;
ws->info.uvd_fw_version =
uvd.available_rings ? uvd_version : 0;
ws->info.vce_fw_version =
vce.available_rings ? vce_version : 0;
ws->info.has_userptr = true;
ws->info.num_render_backends = ws->amdinfo.rb_pipes;
ws->info.clock_crystal_freq = ws->amdinfo.gpu_counter_freq;
+ ws->info.tcc_cache_line_size = 64; /* TC L2 line size on GCN */
ws->info.num_tile_pipes = cik_get_num_tile_pipes(&ws->amdinfo);
ws->info.pipe_interleave_bytes = 256 << ((ws->amdinfo.gb_addr_cfg >> 4) & 0x7);
ws->info.has_virtual_memory = true;
ws->info.has_sdma = dma.available_rings != 0;
/* Get the number of good compute units. */
ws->info.num_good_compute_units = 0;
for (i = 0; i < ws->info.max_se; i++)
for (j = 0; j < ws->info.max_sh_per_se; j++)
ws->info.num_good_compute_units +=
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index bdcf194..7fde04e 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -534,20 +534,21 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws)
return false;
}
}
/* Hawaii with old firmware needs type2 nop packet.
* accel_working2 with value 3 indicates the new firmware.
*/
ws->info.gfx_ib_pad_with_type2 = ws->info.chip_class <= SI ||
(ws->info.family == CHIP_HAWAII &&
ws->accel_working2 < 3);
+ ws->info.tcc_cache_line_size = 64; /* TC L2 line size on GCN */
ws->check_vm = strstr(debug_get_option("R600_DEBUG", ""), "check_vm") != NULL;
return true;
}
static void radeon_winsys_destroy(struct radeon_winsys *rws)
{
struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)rws;
--
2.7.4
More information about the mesa-dev mailing list