[PATCH] nv50: better insn generation

Christoph Bumiller e0425955 at student.tuwien.ac.at
Sun Jun 21 10:07:09 PDT 2009


Avoid allocating extra TEMPs in some cases where they are unnecessary.
---
 src/gallium/drivers/nv50/nv50_program.c |  120 +++++++++++++++---------------
 1 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index d7ab28a..5594560 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -1294,18 +1294,20 @@ static boolean
 nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 {
 	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
-	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
-	unsigned mask, sat, unit;
+	struct nv50_reg *rdst[4], *dst[4], *src[3][4];
+	struct nv50_reg **pp_rtmp, *rtmp = NULL, *temp = NULL;
+	unsigned mask, sat, unit = 0;
 	boolean assimilate = FALSE;
-	int i, c;
+	int i, c, nr_dst = 0;
 
 	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
 	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
 
 	for (c = 0; c < 4; c++) {
-		if (mask & (1 << c))
+		if (mask & (1 << c)) {
 			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
-		else
+			++nr_dst;
+		} else
 			dst[c] = NULL;
 		rdst[c] = NULL;
 		src[0][c] = NULL;
@@ -1313,8 +1315,13 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		src[2][c] = NULL;
 	}
 
+	pp_rtmp = &dst[ffs(mask) - 1];
+	if (*pp_rtmp && (*pp_rtmp)->type != P_TEMP && (nr_dst > 1 || sat))
+		pp_rtmp = &temp;
+
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
-		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
+		const struct tgsi_full_src_register *fs =
+			&inst->FullSrcRegisters[i];
 
 		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
 			unit = fs->SrcRegister.Index;
@@ -1327,10 +1334,15 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 	if (sat) {
 		for (c = 0; c < 4; c++) {
 			rdst[c] = dst[c];
-			dst[c] = temp_temp(pc);
+			if (dst[c] && dst[c]->type != P_TEMP)
+				dst[c] = temp_temp(pc);
 		}
-	} else
-	if (direct2dest_op(inst)) {
+	}
+
+	if (direct2dest_op(inst) && (*pp_rtmp)) {
+		/* We really don't lose the real dst as we do not
+		 * get here if sat overwrites dst with temp.
+		 */
 		for (c = 0; c < 4; c++) {
 			if (!dst[c] || dst[c]->type != P_TEMP)
 				continue;
@@ -1341,7 +1353,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 				    dst[c] == src[2][i])
 					break;
 			}
-			if (i == 4)
+			if (i == 4 || !dst[i])
 				continue;
 
 			assimilate = TRUE;
@@ -1367,48 +1379,32 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		break;
 	case TGSI_OPCODE_COS:
 		temp = temp_temp(pc);
+		rtmp = *pp_rtmp;
 		emit_precossin(pc, temp, src[0][0]);
-		emit_flop(pc, 5, temp, temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_flop(pc, 5, rtmp, temp);
 		break;
 	case TGSI_OPCODE_DP3:
 		temp = temp_temp(pc);
+		rtmp = *pp_rtmp;
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
-		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_mad(pc, rtmp, src[0][2], src[1][2], temp);
 		break;
 	case TGSI_OPCODE_DP4:
 		temp = temp_temp(pc);
+		rtmp = *pp_rtmp;
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		emit_mad(pc, temp, src[0][3], src[1][3], temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_mad(pc, rtmp, src[0][3], src[1][3], temp);
 		break;
 	case TGSI_OPCODE_DPH:
 		temp = temp_temp(pc);
+		rtmp = *pp_rtmp;
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		emit_add(pc, temp, src[1][3], temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_add(pc, rtmp, src[1][3], temp);
 		break;
 	case TGSI_OPCODE_DST:
 	{
@@ -1426,13 +1422,9 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		break;
 	case TGSI_OPCODE_EX2:
 		temp = temp_temp(pc);
+		rtmp = *pp_rtmp;
 		emit_preex2(pc, temp, src[0][0]);
-		emit_flop(pc, 6, temp, temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_flop(pc, 6, rtmp, temp);
 		break;
 	case TGSI_OPCODE_FLR:
 		for (c = 0; c < 4; c++) {
@@ -1461,13 +1453,10 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		emit_lit(pc, &dst[0], mask, &src[0][0]);
 		break;
 	case TGSI_OPCODE_LG2:
-		temp = temp_temp(pc);
-		emit_flop(pc, 3, temp, src[0][0]);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		rtmp = *pp_rtmp;
+		if (!rtmp)
+			rtmp = temp_temp(pc);
+		emit_flop(pc, 3, rtmp, src[0][0]);
 		break;
 	case TGSI_OPCODE_LRP:
 		temp = temp_temp(pc);
@@ -1523,18 +1512,16 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_RCP:
-		for (c = 3; c >= 0; c--) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_flop(pc, 0, dst[c], src[0][0]);
-		}
+		rtmp = *pp_rtmp;
+		if (!rtmp)
+			rtmp = temp_temp(pc);
+		emit_flop(pc, 0, rtmp, src[0][0]);
 		break;
 	case TGSI_OPCODE_RSQ:
-		for (c = 3; c >= 0; c--) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_flop(pc, 2, dst[c], src[0][0]);
-		}
+		rtmp = *pp_rtmp;
+		if (!rtmp)
+			rtmp = temp_temp(pc);
+		emit_flop(pc, 2, rtmp, src[0][0]);
 		break;
 	case TGSI_OPCODE_SCS:
 		temp = temp_temp(pc);
@@ -1557,6 +1544,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		break;
 	case TGSI_OPCODE_SIN:
 		temp = temp_temp(pc);
+		rtmp = *pp_rtmp;
 		emit_precossin(pc, temp, src[0][0]);
 		emit_flop(pc, 4, temp, temp);
 		for (c = 0; c < 4; c++) {
@@ -1611,14 +1599,26 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		return FALSE;
 	}
 
+	if (rtmp) {
+		if (sat)
+			dst[0] = dst[1] = dst[2] = dst[3] = rtmp;
+		else {
+			for (c = 0; c < 4; c++) {
+				if (mask & (1 << c))
+					emit_mov(pc, dst[c], rtmp);
+			}
+		}
+	}
+
 	if (sat) {
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT,
-				 CVT_F32_F32);
+			emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT, 0xc4);
 		}
-	} else if (assimilate) {
+	}
+
+	if (assimilate) {
 		for (c = 0; c < 4; c++)
 			if (rdst[c])
 				assimilate_temp(pc, rdst[c], dst[c]);
-- 
1.6.0.6


--------------090503050107050804030002
Content-Type: text/plain;
 name="0012-nv50-initial-support-for-IF-ELSE-ENDIF-insns.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename*0="0012-nv50-initial-support-for-IF-ELSE-ENDIF-insns.patch"



More information about the Nouveau mailing list