[Mesa-dev] [PATCH 6/6] i965: Add a debug flag for counting cycles spent in each compiled shader.
Eric Anholt
eric at anholt.net
Fri Nov 30 13:24:20 PST 2012
This can be used for two purposes: Using hand-coded shaders to determine
per-instruction timings, or figuring out which shader to optimize in a
whole application. On the psychonauts trace, we see:
type ID cycles spent % of total
...
fs16 543: 339343898369 ( 339.34 Gcycles) 9.2%
fs16 521: 532215110990 ( 532.22 Gcycles) 14.4%
fs16 524: 1036231987390 (1036.23 Gcycles) 28.0%
confirming our previous understanding that fragment shaders are where
it's all at. But on GLBenchmark 2.7, we get:
fs16 69: 205928219888 ( 205.93 Gcycles) 7.5%
fs16 75: 364066413095 ( 364.07 Gcycles) 13.2%
vs 87: 1107217698878 (1107.22 Gcycles) 40.3%
That's interesting. I should look into that.
Note that this doesn't cover the instructions that set up the message to
the URB/FB write -- we'd need to convert the MRF usage in these
instructions to GRFs so that our offsets/times don't overwrite our
shader outputs.
---
src/mesa/drivers/dri/i965/brw_context.c | 3 +
src/mesa/drivers/dri/i965/brw_context.h | 28 ++++-
src/mesa/drivers/dri/i965/brw_defines.h | 20 +++-
src/mesa/drivers/dri/i965/brw_eu.h | 6 +-
src/mesa/drivers/dri/i965/brw_eu_emit.c | 55 +++++++++-
src/mesa/drivers/dri/i965/brw_fs.cpp | 101 +++++++++++++++++
src/mesa/drivers/dri/i965/brw_fs.h | 7 ++
src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 4 +
src/mesa/drivers/dri/i965/brw_program.c | 128 ++++++++++++++++++++++
src/mesa/drivers/dri/i965/brw_vec4.cpp | 81 ++++++++++++++
src/mesa/drivers/dri/i965/brw_vec4.h | 7 ++
src/mesa/drivers/dri/i965/brw_vec4_emit.cpp | 4 +
src/mesa/drivers/dri/i965/brw_vs_surface_state.c | 10 ++
src/mesa/drivers/dri/i965/brw_vtbl.c | 14 +++
src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 7 ++
src/mesa/drivers/dri/intel/intel_context.c | 6 +
src/mesa/drivers/dri/intel/intel_context.h | 1 +
17 files changed, 475 insertions(+), 7 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 4b1b247..5665a3a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -383,6 +383,9 @@ brwCreateContext(int api,
brw_fs_alloc_reg_sets(brw);
+ if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ brw_init_shader_time(brw);
+
return true;
}
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 1abaee3..dc25cab 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -559,14 +559,15 @@ struct brw_vs_prog_data {
#define SURF_INDEX_FRAG_CONST_BUFFER (BRW_MAX_DRAW_BUFFERS + 1)
#define SURF_INDEX_TEXTURE(t) (BRW_MAX_DRAW_BUFFERS + 2 + (t))
#define SURF_INDEX_WM_UBO(u) (SURF_INDEX_TEXTURE(BRW_MAX_TEX_UNIT) + u)
-
+#define SURF_INDEX_WM_SHADER_TIME (SURF_INDEX_WM_UBO(12))
/** Maximum size of the binding table. */
-#define BRW_MAX_WM_SURFACES SURF_INDEX_WM_UBO(BRW_MAX_WM_UBOS)
+#define BRW_MAX_WM_SURFACES (SURF_INDEX_WM_SHADER_TIME + 1)
#define SURF_INDEX_VERT_CONST_BUFFER (0)
#define SURF_INDEX_VS_TEXTURE(t) (SURF_INDEX_VERT_CONST_BUFFER + 1 + (t))
#define SURF_INDEX_VS_UBO(u) (SURF_INDEX_VS_TEXTURE(BRW_MAX_TEX_UNIT) + u)
-#define BRW_MAX_VS_SURFACES SURF_INDEX_VS_UBO(BRW_MAX_VS_UBOS)
+#define SURF_INDEX_VS_SHADER_TIME (SURF_INDEX_VS_UBO(12))
+#define BRW_MAX_VS_SURFACES (SURF_INDEX_VS_SHADER_TIME + 1)
#define SURF_INDEX_SOL_BINDING(t) ((t))
#define BRW_MAX_GS_SURFACES SURF_INDEX_SOL_BINDING(BRW_MAX_SOL_BINDINGS)
@@ -651,6 +652,13 @@ struct brw_tracked_state {
void (*emit)( struct brw_context *brw );
};
+enum shader_time_shader_type {
+ ST_NONE,
+ ST_VS,
+ ST_FS8,
+ ST_FS16,
+};
+
/* Flags for brw->state.cache.
*/
#define CACHE_NEW_BLEND_STATE (1<<BRW_BLEND_STATE)
@@ -1089,6 +1097,16 @@ struct brw_context
uint32_t num_instances;
int basevertex;
+
+ struct {
+ drm_intel_bo *bo;
+ struct gl_shader_program **programs;
+ enum shader_time_shader_type *types;
+ uint64_t *cumulative;
+ int num_entries;
+ int max_entries;
+ double report_time;
+ } shader_time;
};
/*======================================================================
@@ -1144,7 +1162,9 @@ void brwInitFragProgFuncs( struct dd_function_table *functions );
int brw_get_scratch_size(int size);
void brw_get_scratch_bo(struct intel_context *intel,
drm_intel_bo **scratch_bo, int size);
-
+void brw_init_shader_time(struct brw_context *brw);
+void brw_collect_and_report_shader_time(struct brw_context *brw);
+void brw_destroy_shader_time(struct brw_context *brw);
/* brw_urb.c
*/
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 6dc4707..b84d8f9 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -665,6 +665,8 @@ enum opcode {
SHADER_OPCODE_TXS,
FS_OPCODE_TXB,
+ SHADER_OPCODE_SHADER_TIME_ADD,
+
FS_OPCODE_DDX,
FS_OPCODE_DDY,
FS_OPCODE_PIXEL_X,
@@ -729,6 +731,8 @@ enum opcode {
#define BRW_ARF_CONTROL 0x80
#define BRW_ARF_NOTIFICATION_COUNT 0x90
#define BRW_ARF_IP 0xA0
+#define BRW_ARF_TDR 0xB0
+#define BRW_ARF_TIMESTAMP 0xC0
#define BRW_MRF_COMPR4 (1 << 7)
@@ -956,7 +960,21 @@ enum brw_message_target {
#define BRW_SCRATCH_SPACE_SIZE_1M 10
#define BRW_SCRATCH_SPACE_SIZE_2M 11
-
+#define BRW_AOP_AND 1
+#define BRW_AOP_OR 2
+#define BRW_AOP_XOR 3
+#define BRW_AOP_MOV 4
+#define BRW_AOP_INC 5
+#define BRW_AOP_DEC 6
+#define BRW_AOP_ADD 7
+#define BRW_AOP_SUB 8
+#define BRW_AOP_REVSUB 9
+#define BRW_AOP_IMAX 10
+#define BRW_AOP_IMIN 11
+#define BRW_AOP_UMAX 12
+#define BRW_AOP_UMIN 13
+#define BRW_AOP_CMPWR 14
+#define BRW_AOP_PREDEC 15
#define CMD_URB_FENCE 0x6000
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index fce4f15..1415431 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -200,7 +200,7 @@ static INLINE struct brw_reg brw_reg( GLuint file,
else if (file == BRW_MESSAGE_REGISTER_FILE)
assert((nr & ~(1 << 7)) < BRW_MAX_MRF);
else if (file == BRW_ARCHITECTURE_REGISTER_FILE)
- assert(nr <= BRW_ARF_IP);
+ assert(nr <= BRW_ARF_TIMESTAMP);
reg.type = type;
reg.file = file;
@@ -1005,6 +1005,10 @@ void brw_oword_block_write_scratch(struct brw_compile *p,
int num_regs,
GLuint offset);
+void brw_shader_time_add(struct brw_compile *p,
+ int mrf,
+ uint32_t surf_index);
+
/* If/else/endif. Works by manipulating the execution flags on each
* channel.
*/
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index eada9d9..87f1396 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -317,7 +317,8 @@ void brw_set_src1(struct brw_compile *p,
{
assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
- assert(reg.nr < 128);
+ if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
+ assert(reg.nr < 128);
gen7_convert_mrf_to_grf(p, ®);
@@ -2430,3 +2431,55 @@ brw_svb_write(struct brw_compile *p,
0, /* end_of_thread */
send_commit_msg); /* send_commit_msg */
}
+
+/**
+ * This instruction is generated as a single-channel align1 instruction by
+ * both the VS and FS stages when using INTEL_DEBUG=shader_time.
+ *
+ * We can't use the typed atomic op in the FS because that has the execution
+ * mask ANDed with the pixel mask, but we just want to write the one dword for
+ * all the pixels.
+ *
+ * We don't use the SIMD4x2 atomic ops in the VS because want to just write
+ * one u32. So we use the same untyped atomic write message as the pixel
+ * shader.
+ *
+ * The untyped atomic operation requires a BUFFER surface type with RAW
+ * format, and is only accessible through the legacy DATA_CACHE dataport
+ * messages.
+ */
+void brw_shader_time_add(struct brw_compile *p,
+ int base_mrf,
+ uint32_t surf_index)
+{
+ struct intel_context *intel = &p->brw->intel;
+ assert(intel->gen >= 7);
+
+ brw_push_insn_state(p);
+ brw_set_access_mode(p, BRW_ALIGN_1);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
+ brw_pop_insn_state(p);
+
+ /* We use brw_vec1_reg and unmasked because we want to increment the given
+ * offset only once.
+ */
+ brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_NULL, 0));
+ brw_set_src0(p, send, brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+ base_mrf, 0));
+
+ bool header_present = false;
+ bool eot = false;
+ uint32_t mlen = 2; /* offset, value */
+ uint32_t rlen = 0;
+ brw_set_message_descriptor(p, send,
+ GEN7_SFID_DATAPORT_DATA_CACHE,
+ mlen, rlen, header_present, eot);
+
+ send->bits3.ud |= 6 << 14; /* untyped atomic op */
+ send->bits3.ud |= 0 << 13; /* no return data */
+ send->bits3.ud |= 1 << 12; /* SIMD8 mode */
+ send->bits3.ud |= BRW_AOP_ADD << 8;
+ send->bits3.ud |= surf_index << 0;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index e143e7a..2908618 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -401,6 +401,99 @@ fs_visitor::type_size(const struct glsl_type *type)
}
}
+fs_reg
+fs_visitor::get_timestamp()
+{
+ assert(intel->gen >= 7);
+
+ fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_TIMESTAMP,
+ 0),
+ BRW_REGISTER_TYPE_UD));
+
+ fs_reg dst = fs_reg(this, glsl_type::uint_type);
+
+ fs_inst *mov = emit(MOV(dst, ts));
+ /* We want to read the 3 fields we care about (mostly field 0, but also 2)
+ * even if it's not enabled in the dispatch.
+ */
+ mov->force_writemask_all = true;
+ mov->force_uncompressed = true;
+
+ /* The caller wants the low 32 bits of the timestamp. Since it's running
+ * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
+ * which is plenty of time for our purposes. It is identical across the
+ * EUs, but since it's tracking GPU core speed it will increment at a
+ * varying rate as render P-states change.
+ *
+ * The caller could also check if render P-states have changed (or anything
+ * else that might disrupt timing) by setting smear to 2 and checking if
+ * that field is != 0.
+ */
+ dst.smear = 0;
+
+ return dst;
+}
+
+void
+fs_visitor::emit_shader_time_begin()
+{
+ current_annotation = "shader time start";
+ shader_start_time = get_timestamp();
+}
+
+void
+fs_visitor::emit_shader_time_end()
+{
+ current_annotation = "shader time end";
+ /* Choose an index in the buffer and set up tracking information for our
+ * printouts.
+ */
+ int shader_time_index = brw->shader_time.num_entries++;
+ assert(shader_time_index <= brw->shader_time.max_entries);
+ if (dispatch_width == 8) {
+ brw->shader_time.types[shader_time_index] = ST_FS8;
+ } else {
+ assert(dispatch_width == 16);
+ brw->shader_time.types[shader_time_index] = ST_FS16;
+ }
+ if (prog) {
+ _mesa_reference_shader_program(ctx,
+ &brw->shader_time.programs[shader_time_index],
+ prog);
+ }
+
+ /* Take the current timestamp and get the delta. */
+ push_force_uncompressed();
+
+ fs_reg shader_end_time = get_timestamp();
+ shader_start_time.negate = true;
+ fs_reg diff = fs_reg(this, glsl_type::uint_type);
+ emit(ADD(diff, shader_start_time, shader_end_time));
+
+ /* If there were no instructions between the two timestamp gets, the diff
+ * is 2 cycles. Remove that overhead, so I can forget about that when
+ * trying to determine the time taken for single instructions.
+ */
+ emit(ADD(diff, diff, fs_reg(-2u)));
+
+ int base_mrf = 6;
+
+ fs_reg offset_mrf = fs_reg(MRF, base_mrf);
+ offset_mrf.type = BRW_REGISTER_TYPE_UD;
+ emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));
+
+ fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
+ time_mrf.type = BRW_REGISTER_TYPE_UD;
+ emit(MOV(time_mrf, diff));
+
+ fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
+ inst->base_mrf = base_mrf;
+ inst->mlen = 2;
+
+ pop_force_uncompressed();
+}
+
void
fs_visitor::fail(const char *format, ...)
{
@@ -513,6 +606,8 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
case SHADER_OPCODE_TXL:
case SHADER_OPCODE_TXS:
return 1;
+ case SHADER_OPCODE_SHADER_TIME_ADD:
+ return 0;
case FS_OPCODE_FB_WRITE:
return 2;
case FS_OPCODE_PULL_CONSTANT_LOAD:
@@ -2139,6 +2234,9 @@ fs_visitor::run()
if (0) {
emit_dummy_fs();
} else {
+ if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ emit_shader_time_begin();
+
calculate_urb_setup();
if (intel->gen < 6)
emit_interpolation_setup_gen4();
@@ -2162,6 +2260,9 @@ fs_visitor::run()
if (failed)
return false;
+ if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ emit_shader_time_end();
+
emit_fb_writes();
split_virtual_grfs();
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index dc85592..87af8f2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -353,6 +353,10 @@ public:
void emit_color_write(int target, int index, int first_color_mrf);
void emit_fb_writes();
+
+ void emit_shader_time_begin();
+ void emit_shader_time_end();
+
bool try_rewrite_rhs_to_dst(ir_assignment *ir,
fs_reg dst,
fs_reg src,
@@ -363,6 +367,8 @@ public:
void resolve_ud_negate(fs_reg *reg);
void resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg);
+ fs_reg get_timestamp();
+
struct brw_reg interp_reg(int location, int channel);
int setup_uniform_values(int loc, const glsl_type *type);
void setup_builtin_uniform_values(ir_variable *ir);
@@ -424,6 +430,7 @@ public:
fs_reg pixel_w;
fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
+ fs_reg shader_start_time;
int grf_used;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index 498b720..05b2f16 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -1029,6 +1029,10 @@ fs_generator::generate_code(exec_list *instructions)
generate_mov_dispatch_to_flags();
break;
+ case SHADER_OPCODE_SHADER_TIME_ADD:
+ brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME);
+ break;
+
default:
if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
_mesa_problem(ctx, "Unsupported opcode `%s' in FS",
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 6bf5a6a..1ad186d 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -189,3 +189,131 @@ void brwInitFragProgFuncs( struct dd_function_table *functions )
functions->LinkShader = brw_link_shader;
}
+
+void
+brw_init_shader_time(struct brw_context *brw)
+{
+ struct intel_context *intel = &brw->intel;
+
+ const int max_entries = 4096;
+ brw->shader_time.bo = drm_intel_bo_alloc(intel->bufmgr, "shader time",
+ max_entries * 4, 4096);
+ brw->shader_time.programs = rzalloc_array(brw, struct gl_shader_program *,
+ max_entries);
+ brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
+ max_entries);
+ brw->shader_time.cumulative = rzalloc_array(brw, uint64_t,
+ max_entries);
+ brw->shader_time.max_entries = max_entries;
+}
+
+static int
+compare_time(const void *a, const void *b)
+{
+ uint64_t * const *a_val = a;
+ uint64_t * const *b_val = b;
+
+ /* We don't just subtract because we're turning the value to an int. */
+ if (**a_val < **b_val)
+ return -1;
+ else if (**a_val == **b_val)
+ return 0;
+ else
+ return 1;
+}
+
+static void
+brw_report_shader_time(struct brw_context *brw)
+{
+ if (!brw->shader_time.bo || !brw->shader_time.num_entries)
+ return;
+
+ uint64_t *sorted[brw->shader_time.num_entries];
+ double total = 0;
+ for (int i = 0; i < brw->shader_time.num_entries; i++) {
+ sorted[i] = &brw->shader_time.cumulative[i];
+ total += brw->shader_time.cumulative[i];
+ }
+
+ if (total == 0) {
+ printf("No shader time collected yet\n");
+ return;
+ }
+
+ qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);
+
+ printf("\n");
+ printf("type ID cycles spent %% of total\n");
+ for (int s = 0; s < brw->shader_time.num_entries; s++) {
+ /* Work back from the sorted pointers times to a time to print. */
+ int i = sorted[s] - brw->shader_time.cumulative;
+
+ int shader_num = -1;
+ if (brw->shader_time.programs[i]) {
+ shader_num = brw->shader_time.programs[i]->Name;
+ }
+
+ switch (brw->shader_time.types[i]) {
+ case ST_VS:
+ printf("vs %4d: ", shader_num);
+ break;
+ case ST_FS8:
+ printf("fs8 %4d: ", shader_num);
+ break;
+ case ST_FS16:
+ printf("fs16 %4d: ", shader_num);
+ break;
+ default:
+ printf("other: ");
+ break;
+ }
+
+ printf("%16lld (%7.2f Gcycles) %4.1f%%\n",
+ (long long)brw->shader_time.cumulative[i],
+ (double)brw->shader_time.cumulative[i] / 1000000000.0,
+ (double)brw->shader_time.cumulative[i] / total * 100.0);
+ }
+}
+
+static void
+brw_collect_shader_time(struct brw_context *brw)
+{
+ if (!brw->shader_time.bo)
+ return;
+
+ /* This probably stalls on the last rendering. We could fix that by
+ * delaying reading the reports, but it doesn't look like it's a big
+ * overhead compared to the cost of tracking the time in the first place.
+ */
+ drm_intel_bo_map(brw->shader_time.bo, true);
+
+ uint32_t *times = brw->shader_time.bo->virtual;
+
+ for (int i = 0; i < brw->shader_time.num_entries; i++) {
+ brw->shader_time.cumulative[i] += times[i];
+ }
+
+ /* Zero the BO out to clear it out for our next collection.
+ */
+ memset(times, 0, brw->shader_time.bo->size);
+ drm_intel_bo_unmap(brw->shader_time.bo);
+}
+
+void
+brw_collect_and_report_shader_time(struct brw_context *brw)
+{
+ brw_collect_shader_time(brw);
+
+ if (brw->shader_time.report_time == 0 ||
+ get_time() - brw->shader_time.report_time >= 1.0) {
+ brw_report_shader_time(brw);
+ brw->shader_time.report_time = get_time();
+ }
+}
+
+void
+brw_destroy_shader_time(struct brw_context *brw)
+{
+ drm_intel_bo_unreference(brw->shader_time.bo);
+ brw->shader_time.bo = NULL;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 5200daa..20f924f 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -26,6 +26,7 @@
extern "C" {
#include "main/macros.h"
+#include "main/shaderobj.h"
#include "program/prog_print.h"
#include "program/prog_parameter.h"
}
@@ -248,6 +249,8 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
return 2;
case VS_OPCODE_SCRATCH_WRITE:
return 3;
+ case SHADER_OPCODE_SHADER_TIME_ADD:
+ return 0;
default:
assert(!"not reached");
return inst->mlen;
@@ -1039,9 +1042,84 @@ vec4_visitor::setup_payload(void)
this->first_non_payload_grf = reg;
}
+src_reg
+vec4_visitor::get_timestamp()
+{
+ assert(intel->gen >= 7);
+
+ src_reg ts = src_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_TIMESTAMP,
+ 0),
+ BRW_REGISTER_TYPE_UD));
+
+ dst_reg dst = dst_reg(this, glsl_type::uvec4_type);
+
+ vec4_instruction *mov = emit(MOV(dst, ts));
+ /* We want to read the 3 fields we care about (mostly field 0, but also 2)
+ * even if it's not enabled in the dispatch.
+ */
+ mov->force_writemask_all = true;
+
+ return src_reg(dst);
+}
+
+void
+vec4_visitor::emit_shader_time_begin()
+{
+ current_annotation = "shader time start";
+ shader_start_time = get_timestamp();
+}
+
+void
+vec4_visitor::emit_shader_time_end()
+{
+ current_annotation = "shader time end";
+ /* Choose an index in the buffer and set up tracking information for our
+ * printouts.
+ */
+ int shader_time_index = brw->shader_time.num_entries++;
+ assert(shader_time_index <= brw->shader_time.max_entries);
+ brw->shader_time.types[shader_time_index] = ST_VS;
+ if (prog) {
+ _mesa_reference_shader_program(ctx,
+ &brw->shader_time.programs[shader_time_index],
+ prog);
+ }
+
+ /* Take the current timestamp and get the delta. */
+ src_reg shader_end_time = get_timestamp();
+ shader_start_time.negate = true;
+ dst_reg diff = dst_reg(this, glsl_type::uint_type);
+ emit(ADD(diff, shader_start_time, shader_end_time));
+
+ /* If there were no instructions between the two timestamp gets, the diff
+ * is 2 cycles. Remove that overhead, so I can forget about that when
+ * trying to determine the time taken for single instructions.
+ */
+ emit(ADD(diff, src_reg(diff), src_reg(-2u)));
+
+ int base_mrf = 6;
+
+ dst_reg offset_mrf = dst_reg(MRF, base_mrf);
+ offset_mrf.type = BRW_REGISTER_TYPE_UD;
+ emit(MOV(offset_mrf, src_reg(shader_time_index * 4)));
+
+ dst_reg time_mrf = dst_reg(MRF, base_mrf + 1);
+ time_mrf.type = BRW_REGISTER_TYPE_UD;
+ emit(MOV(time_mrf, src_reg(diff)));
+
+ vec4_instruction *inst;
+ inst = emit(SHADER_OPCODE_SHADER_TIME_ADD);
+ inst->base_mrf = base_mrf;
+ inst->mlen = 2;
+}
+
bool
vec4_visitor::run()
{
+ if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ emit_shader_time_begin();
+
emit_attribute_fixups();
/* Generate VS IR for main(). (the visitor only descends into
@@ -1057,6 +1135,9 @@ vec4_visitor::run()
if (c->key.userclip_active && !c->key.uses_clip_distance)
setup_uniform_clipplane_values();
+ if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ emit_shader_time_end();
+
emit_urb_writes();
/* Before any optimization, push array accesses out to scratch
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 8c6f56a..5192561 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -302,6 +302,8 @@ public:
int uniform_vector_size[MAX_UNIFORMS];
int uniforms;
+ src_reg shader_start_time;
+
struct hash_table *variable_ht;
bool run(void);
@@ -434,6 +436,9 @@ public:
void emit_urb_slot(int mrf, int vert_result);
void emit_urb_writes(void);
+ void emit_shader_time_begin();
+ void emit_shader_time_end();
+
src_reg get_scratch_offset(vec4_instruction *inst,
src_reg *reladdr, int reg_offset);
src_reg get_pull_constant_offset(vec4_instruction *inst,
@@ -452,6 +457,8 @@ public:
bool try_emit_sat(ir_expression *ir);
void resolve_ud_negate(src_reg *reg);
+ src_reg get_timestamp();
+
bool process_move_condition(ir_rvalue *ir);
void dump_instruction(vec4_instruction *inst);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
index b8a1b59..44d9186 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
@@ -658,6 +658,10 @@ vec4_generator::generate_vs_instruction(vec4_instruction *instruction,
generate_pull_constant_load(inst, dst, src[0], src[1]);
break;
+ case SHADER_OPCODE_SHADER_TIME_ADD:
+ brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_VS_SHADER_TIME);
+ break;
+
default:
if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
_mesa_problem(ctx, "Unsupported opcode in `%s' in VS\n",
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index d70c36e..3985b48 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -138,9 +138,19 @@ const struct brw_tracked_state brw_vs_ubo_surfaces = {
static void
brw_vs_upload_binding_table(struct brw_context *brw)
{
+ struct intel_context *intel = &brw->intel;
uint32_t *bind;
int i;
+ if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
+ intel->vtbl.create_constant_surface(brw, brw->shader_time.bo, 0,
+ brw->shader_time.bo->size,
+ &brw->vs.surf_offset[SURF_INDEX_VS_SHADER_TIME]);
+
+ assert(brw->vs.prog_data->num_surfaces <= SURF_INDEX_VS_SHADER_TIME);
+ brw->vs.prog_data->num_surfaces = SURF_INDEX_VS_SHADER_TIME;
+ }
+
/* CACHE_NEW_VS_PROG: Skip making a binding table if we don't use textures or
* pull constants.
*/
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index 0da6070..f2f0e7b 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -43,6 +43,7 @@
#include "intel_fbo.h"
#include "brw_context.h"
+#include "brw_program.h"
#include "brw_defines.h"
#include "brw_state.h"
#include "brw_draw.h"
@@ -69,6 +70,11 @@ static void brw_destroy_context( struct intel_context *intel )
{
struct brw_context *brw = brw_context(&intel->ctx);
+ if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
+ brw_collect_and_report_shader_time(brw);
+ brw_destroy_shader_time(brw);
+ }
+
brw_destroy_state(brw);
brw_draw_destroy( brw );
@@ -201,6 +207,14 @@ static void brw_new_batch( struct intel_context *intel )
* next batch.
*/
brw->cache.bo_used_by_gpu = true;
+
+ /* We need to periodically reap the shader time results, because rollover
+ * happens every few seconds. We also want to see results every once in a
+ * while, because many programs won't cleanly destroy our context, so the
+ * end-of-run printout may not happen.
+ */
+ if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+ brw_collect_and_report_shader_time(brw);
}
static void brw_invalidate_state( struct intel_context *intel, GLuint new_state )
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index a7866d5..66301a9 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -1405,9 +1405,16 @@ const struct brw_tracked_state brw_wm_ubo_surfaces = {
static void
brw_upload_wm_binding_table(struct brw_context *brw)
{
+ struct intel_context *intel = &brw->intel;
uint32_t *bind;
int i;
+ if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
+ intel->vtbl.create_constant_surface(brw, brw->shader_time.bo, 0,
+ brw->shader_time.bo->size,
+ &brw->wm.surf_offset[SURF_INDEX_WM_SHADER_TIME]);
+ }
+
/* Might want to calculate nr_surfaces first, to avoid taking up so much
* space for the binding table.
*/
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index 204609e..cf340db 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -491,6 +491,7 @@ static const struct dri_debug_control debug_control[] = {
{ "vs", DEBUG_VS },
{ "clip", DEBUG_CLIP },
{ "aub", DEBUG_AUB },
+ { "shader_time", DEBUG_SHADER_TIME },
{ NULL, 0 }
};
@@ -746,6 +747,11 @@ intelInitContext(struct intel_context *intel,
INTEL_DEBUG = driParseDebugString(getenv("INTEL_DEBUG"), debug_control);
if (INTEL_DEBUG & DEBUG_BUFMGR)
dri_bufmgr_set_debug(intel->bufmgr, true);
+ if ((INTEL_DEBUG & DEBUG_SHADER_TIME) && intel->gen < 7) {
+ fprintf(stderr,
+ "shader_time debugging requires gen7 (Ivybridge) or better.\n");
+ INTEL_DEBUG &= ~DEBUG_SHADER_TIME;
+ }
if (INTEL_DEBUG & DEBUG_AUB)
drm_intel_bufmgr_gem_set_aub_dump(intel->bufmgr, true);
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index eeefadf..8d544dc 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -456,6 +456,7 @@ extern int INTEL_DEBUG;
#define DEBUG_VS 0x1000000
#define DEBUG_CLIP 0x2000000
#define DEBUG_AUB 0x4000000
+#define DEBUG_SHADER_TIME 0x8000000
#ifdef HAVE_ANDROID_PLATFORM
#define LOG_TAG "INTEL-MESA"
--
1.7.10.4
More information about the mesa-dev
mailing list