<div>Replace i2f() in r600_blit_kms.c with an optimized version.</div><div><br></div><div>We use __fls() to find the most significant bit. Using that, the</div><div>loop can be avoided. A second trick is to use the mod(32)</div>
<div>behaviour of the rotate instructions on x86 to expand the range</div><div>of the unsigned int to float conversion to the full 32 bits.</div><div><br></div><div>The routine is now exact up to 2^24. Above that, we truncate which</div>
<div>is equivalent to rounding towards zero.</div><div><br></div><div>Signed-off-by: Steven Fuerst &lt;<a href="mailto:svfuerst@gmail.com">svfuerst@gmail.com</a>&gt;</div><div>---</div><div> drivers/gpu/drm/radeon/r600_blit_kms.c | 53 ++++++++++++++------------------</div>
<div> 1 file changed, 23 insertions(+), 30 deletions(-)</div><div><br></div><div>diff --git a/drivers/gpu/drm/radeon/r600_blit_kms.c b/drivers/gpu/drm/radeon/r600_blit_kms.c</div><div>index 2bef854..8307558 100644</div><div>
--- a/drivers/gpu/drm/radeon/r600_blit_kms.c</div><div>+++ b/drivers/gpu/drm/radeon/r600_blit_kms.c</div><div>@@ -455,44 +455,37 @@ set_default_state(struct radeon_device *rdev)</div><div> <span class="Apple-tab-span" style="white-space:pre"> </span>radeon_ring_write(ring, sq_stack_resource_mgmt_2);</div>
<div> }</div><div> </div><div>-#define I2F_MAX_BITS 15</div><div>-#define I2F_MAX_INPUT ((1 << I2F_MAX_BITS) - 1)</div><div>-#define I2F_SHIFT (24 - I2F_MAX_BITS)</div><div>+/* 23 bits of float fractional data */</div>
<div>+#define I2F_FRAC_BITS<span class="Apple-tab-span" style="white-space:pre"> </span>23</div><div>+#define I2F_MASK ((1 << I2F_FRAC_BITS) - 1)</div><div> </div><div> /*</div><div> * Converts unsigned integer into 32-bit IEEE floating point representation.</div>
<div>- * Conversion is not universal and only works for the range from 0</div><div>- * to 2^I2F_MAX_BITS-1. Currently we only use it with inputs between</div><div>- * 0 and 16384 (inclusive), so I2F_MAX_BITS=15 is enough. If necessary,</div>
<div>- * I2F_MAX_BITS can be increased, but that will add to the loop iterations</div><div>- * and slow us down. Conversion is done by shifting the input and counting</div><div>- * down until the first 1 reaches bit position 23. The resulting counter</div>
<div>- * and the shifted input are, respectively, the exponent and the fraction.</div><div>- * The sign is always zero.</div><div>+ * Will be exact from 0 to 2^24. Above that, we round towards zero</div><div>+ * as the fractional bits will not fit in a float. (It would be better to</div>
<div>+ * round towards even as the fpu does, but that is slower.)</div><div>+ * This routine depends on the mod(32) behaviour of the rotate instructions</div><div>+ * on x86.</div><div> */</div><div>-static uint32_t i2f(uint32_t input)</div>
<div>+static uint32_t i2f(uint32_t x)</div><div> {</div><div>-<span class="Apple-tab-span" style="white-space:pre"> </span>u32 result, i, exponent, fraction;</div><div>+<span class="Apple-tab-span" style="white-space:pre"> </span>uint32_t msb, exponent, fraction;</div>
<div> </div><div>-<span class="Apple-tab-span" style="white-space:pre"> </span>WARN_ON_ONCE(input > I2F_MAX_INPUT);</div><div>+<span class="Apple-tab-span" style="white-space:pre"> </span>/* Zero is special */</div><div>
+<span class="Apple-tab-span" style="white-space:pre"> </span>if (!x) return 0;</div><div> </div><div>-<span class="Apple-tab-span" style="white-space:pre"> </span>if ((input & I2F_MAX_INPUT) == 0)</div><div>-<span class="Apple-tab-span" style="white-space:pre"> </span>result = 0;</div>
<div>-<span class="Apple-tab-span" style="white-space:pre"> </span>else {</div><div>-<span class="Apple-tab-span" style="white-space:pre"> </span>exponent = 126 + I2F_MAX_BITS;</div><div>-<span class="Apple-tab-span" style="white-space:pre"> </span>fraction = (input & I2F_MAX_INPUT) << I2F_SHIFT;</div>
<div>+<span class="Apple-tab-span" style="white-space:pre"> </span>/* Get location of the most significant bit */</div><div>+<span class="Apple-tab-span" style="white-space:pre"> </span>msb = __fls(x);</div><div> </div><div>
-<span class="Apple-tab-span" style="white-space:pre"> </span>for (i = 0; i < I2F_MAX_BITS; i++) {</div><div>-<span class="Apple-tab-span" style="white-space:pre"> </span>if (fraction & 0x800000)</div><div>-<span class="Apple-tab-span" style="white-space:pre"> </span>break;</div>
<div>-<span class="Apple-tab-span" style="white-space:pre"> </span>else {</div><div>-<span class="Apple-tab-span" style="white-space:pre"> </span>fraction = fraction << 1;</div><div>-<span class="Apple-tab-span" style="white-space:pre"> </span>exponent = exponent - 1;</div>
<div>-<span class="Apple-tab-span" style="white-space:pre"> </span>}</div><div>-<span class="Apple-tab-span" style="white-space:pre"> </span>}</div><div>-<span class="Apple-tab-span" style="white-space:pre"> </span>result = exponent << 23 | (fraction & 0x7fffff);</div>
<div>-<span class="Apple-tab-span" style="white-space:pre"> </span>}</div><div>-<span class="Apple-tab-span" style="white-space:pre"> </span>return result;</div><div>+<span class="Apple-tab-span" style="white-space:pre"> </span>/*</div>
<div>+<span class="Apple-tab-span" style="white-space:pre"> </span> * Use a rotate instead of a shift because that works both leftwards</div><div>+<span class="Apple-tab-span" style="white-space:pre"> </span> * and rightwards due to the mod(32) behaviour. This means we don't</div>
<div>+<span class="Apple-tab-span" style="white-space:pre"> </span> * need to check to see if we are above 2^24 or not.</div><div>+<span class="Apple-tab-span" style="white-space:pre"> </span> */</div><div>+<span class="Apple-tab-span" style="white-space:pre"> </span>fraction = ror32(x, msb - I2F_FRAC_BITS) & I2F_MASK;</div>
<div>+<span class="Apple-tab-span" style="white-space:pre"> </span>exponent = (127 + msb) << I2F_FRAC_BITS;</div><div>+</div><div>+<span class="Apple-tab-span" style="white-space:pre"> </span>return fraction + exponent;</div>
<div> }</div><div> </div><div> int r600_blit_init(struct radeon_device *rdev)</div>