Mesa (nv50-compiler): nv50: use actual loads/stores if TEMPs are accessed indirectly

Christoph Bumiller chrisbmr at kemper.freedesktop.org
Thu Sep 9 17:21:34 UTC 2010


Module: Mesa
Branch: nv50-compiler
Commit: f30810cb68a53c4fef360778a230126ed0ee0ee3
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=f30810cb68a53c4fef360778a230126ed0ee0ee3

Author: Christoph Bumiller <e0425955 at student.tuwien.ac.at>
Date:   Thu Sep  9 19:12:54 2010 +0200

nv50: use actual loads/stores if TEMPs are accessed indirectly

---

 src/gallium/drivers/nv50/nv50_pc.c          |    2 +
 src/gallium/drivers/nv50/nv50_pc.h          |    3 +
 src/gallium/drivers/nv50/nv50_pc_emit.c     |   28 ++++++++++---
 src/gallium/drivers/nv50/nv50_pc_optimize.c |   19 ++++++---
 src/gallium/drivers/nv50/nv50_pc_print.c    |    3 +
 src/gallium/drivers/nv50/nv50_program.c     |    7 +++
 src/gallium/drivers/nv50/nv50_program.h     |    1 +
 src/gallium/drivers/nv50/nv50_screen.c      |   25 ++++++++++--
 src/gallium/drivers/nv50/nv50_screen.h      |    3 +-
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  |   54 +++++++++++++++++++++++++--
 10 files changed, 122 insertions(+), 23 deletions(-)

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index c54f16e..637b3cf 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -414,6 +414,8 @@ nv50_generate_code(struct nv50_translation_info *ti)
    nv_print_program(pc);
 #endif
 
+   pc->opt_reload_elim = ti->store_to_memory ? FALSE : TRUE;
+
    /* optimization */
    ret = nv_pc_exec_pass0(pc);
    if (ret)
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index d9cc775..ba32ab0 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -345,6 +345,9 @@ struct nv_pc {
 
    struct nv_fixup *fixups;
    int num_fixups;
+
+   /* optimization enables */
+   boolean opt_reload_elim;
 };
 
 void nvbb_insert_tail(struct nv_basic_block *, struct nv_instruction *);
diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
index bb0a6f3..8c64b19 100644
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -412,25 +412,25 @@ emit_form_IMM(struct nv_pc *pc, struct nv_instruction *i, ubyte mod_mask)
 }
 
 static void
-set_ld_st_size(struct nv_pc *pc, ubyte type)
+set_ld_st_size(struct nv_pc *pc, int s, ubyte type)
 {
    switch (type) {
    case NV_TYPE_F64:
-      pc->emit[1] |= 0x8000;
+      pc->emit[1] |= 0x8000 << s;
       break;
    case NV_TYPE_F32:
    case NV_TYPE_S32:
    case NV_TYPE_U32:
-      pc->emit[1] |= 0xc000;
+      pc->emit[1] |= 0xc000 << s;
       break;
    case NV_TYPE_S16:
-      pc->emit[1] |= 0x6000;
+      pc->emit[1] |= 0x6000 << s;
       break;
    case NV_TYPE_U16:
-      pc->emit[1] |= 0x4000;
+      pc->emit[1] |= 0x4000 << s;
       break;
    case NV_TYPE_S8:
-      pc->emit[1] |= 0x2000;
+      pc->emit[1] |= 0x2000 << s;
       break;
    default:
       break;
@@ -473,12 +473,14 @@ emit_ld(struct nv_pc *pc, struct nv_instruction *i)
    if (sf == NV_FILE_MEM_L) {
       pc->emit[0] = 0xd0000001;
       pc->emit[1] = 0x40000000;
+
+      set_addr(pc, i);
    } else {
       NOUVEAU_ERR("invalid ld source file\n");
       abort();
    }
 
-   set_ld_st_size(pc, STYPE(i, 0));
+   set_ld_st_size(pc, (sf == NV_FILE_MEM_L) ? 8 : 0, STYPE(i, 0));
 
    set_dst(pc, i->def[0]);
    set_pred_wr(pc, i);
@@ -495,7 +497,19 @@ emit_ld(struct nv_pc *pc, struct nv_instruction *i)
 static void
 emit_st(struct nv_pc *pc, struct nv_instruction *i)
 {
+   assert(SFILE(i, 1) == NV_FILE_GPR);
+   assert(SFILE(i, 0) == NV_FILE_MEM_L);
+
+   pc->emit[0] = 0xd0000001;
+   pc->emit[1] = 0x60000000;
 
+   SID(pc, i->src[1], 2);
+   SID(pc, i->src[0], 9);
+
+   set_ld_st_size(pc, 8, STYPE(i, 1));
+
+   set_addr(pc, i);
+   set_pred(pc, i);
 }
 
 static int
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 4f5bdc1..09d232a 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -82,6 +82,8 @@ inst_commutation_legal(struct nv_instruction *a,
 static INLINE boolean
 inst_cullable(struct nv_instruction *nvi)
 {
+   if (nvi->opcode == NV_OP_STA)
+      return FALSE;
    return (!(nvi->is_terminator || nvi->is_join ||
              nvi->target ||
              nvi->fixed ||
@@ -739,6 +741,7 @@ struct nv_pass_reld_elim {
    int alloc;
 };
 
+/* TODO: properly handle loads from l[] memory in the presence of stores */
 static int
 nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b)
 {
@@ -1074,13 +1077,15 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
    if (ret)
       return ret;
 
-   reldelim = CALLOC_STRUCT(nv_pass_reld_elim);
-   reldelim->pc = pc;
-   pc->pass_seq++;
-   ret = nv_pass_reload_elim(reldelim, root);
-   FREE(reldelim);
-   if (ret)
-      return ret;
+   if (pc->opt_reload_elim) {
+      reldelim = CALLOC_STRUCT(nv_pass_reld_elim);
+      reldelim->pc = pc;
+      pc->pass_seq++;
+      ret = nv_pass_reload_elim(reldelim, root);
+      FREE(reldelim);
+      if (ret)
+         return ret;
+   }
 
    pc->pass_seq++;
    ret = nv_pass_cse(&pass, root);
diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c
index 01a6f00..74c3970 100644
--- a/src/gallium/drivers/nv50/nv50_pc_print.c
+++ b/src/gallium/drivers/nv50/nv50_pc_print.c
@@ -217,6 +217,9 @@ nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type)
    case NV_FILE_FLAGS:
       PRINT(" %s%cc%i", mgta, reg_pfx, nv_value_id(value));
       break;
+   case NV_FILE_MEM_L:
+      nv_print_address('l', -1, ind, 4 * nv_value_id(value));
+      break;
    case NV_FILE_MEM_S:
       nv_print_address('s', -1, ind, 4 * nv_value_id(value));
       break;
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 9250287..24952f7 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -168,10 +168,17 @@ prog_inst(struct nv50_translation_info *ti,
           inst->Src[0].Register.File == TGSI_FILE_INPUT &&
           dst->Index == ti->edgeflag_out)
          ti->p->vp.edgeflag = inst->Src[0].Register.Index;
+   } else
+   if (inst->Dst[0].Register.File == TGSI_FILE_TEMPORARY) {
+      if (inst->Dst[0].Register.Indirect)
+         ti->store_to_memory = TRUE;
    }
 
    for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) {
       src = &inst->Src[s].Register;
+      if (src->File == TGSI_FILE_TEMPORARY)
+         if (inst->Src[s].Register.Indirect)
+            ti->store_to_memory = TRUE;
       if (src->File != TGSI_FILE_INPUT)
          continue;
       mask = nv50_tgsi_src_mask(inst, s);
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 918baf3..a1b2bde 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -116,6 +116,7 @@ struct nv50_translation_info {
    int output_access[PIPE_MAX_SHADER_OUTPUTS][4];
    boolean indirect_inputs;
    boolean indirect_outputs;
+   boolean store_to_memory;
    struct tgsi_shader_info scan;
    uint32_t *immd32;
    unsigned immd32_nr;
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index c1efa44..24a6d80 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -274,7 +274,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	uint64_t value;
 	unsigned chipset = dev->chipset;
 	unsigned tesla_class = 0;
-	unsigned stack_size;
+	unsigned stack_size, local_size, max_warps;
 	int ret, i;
 	const unsigned rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
 
@@ -495,9 +495,10 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	/* shader stack */
 	nouveau_device_get_param(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
 
-	stack_size  = util_bitcount(value & 0xffff);
-	stack_size *= util_bitcount((value >> 24) & 0xf);
-	stack_size *= 32 * 64 * 8;
+	max_warps  = util_bitcount(value & 0xffff);
+	max_warps *= util_bitcount((value >> 24) & 0xf) * 32;
+
+	stack_size = max_warps * 64 * 8;
 
 	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
 			     stack_size, &screen->stack_bo);
@@ -510,6 +511,22 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	OUT_RELOCl(chan, screen->stack_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 	OUT_RING  (chan, 4);
 
+	local_size = (NV50_CAP_MAX_PROGRAM_TEMPS * 16) * max_warps * 32;
+
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
+			     local_size, &screen->local_bo);
+	if (ret) {
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	local_size = NV50_CAP_MAX_PROGRAM_TEMPS * 16;
+
+	BEGIN_RING(chan, screen->tesla, NV50TCL_LOCAL_ADDRESS_HIGH, 3);
+	OUT_RELOCh(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCl(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RING  (chan, util_unsigned_logbase2(local_size / 8));
+
 	/* Vertex array limits - max them out */
 	for (i = 0; i < 16; i++) {
 		BEGIN_RING(chan, screen->tesla,
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h
index 1517f56..ad6bdeb 100644
--- a/src/gallium/drivers/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nv50/nv50_screen.h
@@ -25,7 +25,8 @@ struct nv50_screen {
 	struct nouveau_bo *tic;
 	struct nouveau_bo *tsc;
 
-	struct nouveau_bo *stack_bo;
+	struct nouveau_bo *stack_bo; /* control flow stack */
+	struct nouveau_bo *local_bo; /* l[] memory */
 
 	boolean force_push;
 };
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 983fcb2..f4fee4e 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -558,6 +558,38 @@ bld_insn_3(struct bld_context *bld, uint opcode,
    return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type));
 }
 
+static void
+bld_lmem_store(struct bld_context *bld, struct nv_value *ptr, int ofst,
+               struct nv_value *val)
+{
+   struct nv_instruction *insn = new_instruction(bld->pc, NV_OP_STA);
+   struct nv_value *loc;
+
+   loc = new_value(bld->pc, NV_FILE_MEM_L, NV_TYPE_U32);
+
+   loc->reg.id = ofst * 4;
+
+   nv_reference(bld->pc, &insn->src[0], loc);
+   nv_reference(bld->pc, &insn->src[1], val);
+   nv_reference(bld->pc, &insn->src[4], ptr);
+}
+
+static struct nv_value *
+bld_lmem_load(struct bld_context *bld, struct nv_value *ptr, int ofst)
+{
+   struct nv_value *loc, *val;
+
+   loc = new_value(bld->pc, NV_FILE_MEM_L, NV_TYPE_U32);
+
+   loc->reg.id = ofst * 4;
+
+   val = bld_insn_1(bld, NV_OP_LDA, loc);
+
+   nv_reference(bld->pc, &val->insn->src[4], ptr);
+
+   return val;
+}
+
 #define BLD_INSN_1_EX(d, op, dt, s0, s0t)           \
    do {                                             \
       (d) = bld_insn_1(bld, (NV_OP_##op), (s0));    \
@@ -854,10 +886,18 @@ infer_dst_type(unsigned opcode)
 
 static void
 emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst,
-	   unsigned chan, struct nv_value *value)
+           unsigned chan, struct nv_value *value)
 {
+   struct nv_value *ptr;
    const struct tgsi_full_dst_register *reg = &inst->Dst[0];
 
+   if (reg->Register.Indirect) {
+      ptr = FETCH_ADDR(reg->Indirect.Index,
+                       tgsi_util_get_src_register_swizzle(&reg->Indirect, 0));
+   } else {
+      ptr = NULL;
+   }
+
    assert(chan < 4);
 
    if (inst->Instruction.Opcode != TGSI_OPCODE_MOV)
@@ -893,7 +933,11 @@ emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst,
       value->reg.file = NV_FILE_GPR;
       if (value->insn->bb != bld->pc->current_block)
          value = bld_insn_1(bld, NV_OP_MOV, value);
-      STORE_TEMP(reg->Register.Index, chan, value);
+
+      if (bld->ti->store_to_memory)
+         bld_lmem_store(bld, ptr, reg->Register.Index * 4 + chan, value);
+      else
+         STORE_TEMP(reg->Register.Index, chan, value);
       break;
    case TGSI_FILE_ADDRESS:
       assert(reg->Register.Index < BLD_MAX_ADDRS);
@@ -1064,8 +1108,10 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
       bld->saved_inputs[bld->ti->input_map[idx][swz]] = res;
       break;
    case TGSI_FILE_TEMPORARY:
-      /* this should be load from l[], with reload elimination later on */
-      res = bld_fetch_global(bld, &bld->tvs[idx][swz]);
+      if (bld->ti->store_to_memory)
+         res = bld_lmem_load(bld, ptr, idx * 4 + swz);
+      else
+         res = bld_fetch_global(bld, &bld->tvs[idx][swz]);
       break;
    case TGSI_FILE_ADDRESS:
       res = bld_fetch_global(bld, &bld->avs[idx][swz]);




More information about the mesa-commit mailing list