[Mesa-dev] [PATCH 4/4] softpipe: add tgsi_exec support for new bit manipulation opcodes

Fri Apr 25 16:45:41 PDT 2014

On Fri, Apr 25, 2014 at 5:44 PM, Roland Scheidegger <sroland at vmware.com> wrote:
> Am 25.04.2014 19:41, schrieb Ilia Mirkin:
>> Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
>> ---
>>  src/gallium/auxiliary/tgsi/tgsi_exec.c | 188 +++++++++++++++++++++++++++++++++
>>  src/gallium/auxiliary/util/u_math.h    |  11 ++
>>  2 files changed, 199 insertions(+)
>>
>> diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
>> index 55da60a..2cc7884 100644
>> --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
>> +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
>> @@ -2603,6 +2603,40 @@ exec_vector_trinary(struct tgsi_exec_machine *mach,
>>     }
>>  }
>>
>> +typedef void (* micro_quaternary_op)(union tgsi_exec_channel *dst,
>> +                                     const union tgsi_exec_channel *src0,
>> +                                     const union tgsi_exec_channel *src1,
>> +                                     const union tgsi_exec_channel *src2,
>> +                                     const union tgsi_exec_channel *src3);
>> +
>> +static void
>> +exec_vector_quaternary(struct tgsi_exec_machine *mach,
>> +                       const struct tgsi_full_instruction *inst,
>> +                       micro_quaternary_op op,
>> +                       enum tgsi_exec_datatype dst_datatype,
>> +                       enum tgsi_exec_datatype src_datatype)
>> +{
>> +   unsigned int chan;
>> +   struct tgsi_exec_vector dst;
>> +
>> +   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
>> +      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
>> +         union tgsi_exec_channel src[4];
>> +
>> +         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
>> +         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
>> +         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
>> +         fetch_source(mach, &src[3], &inst->Src[3], chan, src_datatype);
>> +         op(&dst.xyzw[chan], &src[0], &src[1], &src[2], &src[3]);
>> +      }
>> +   }
>> +   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
>> +      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
>> +         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
>> +      }
>> +   }
>> +}
>> +
>>  static void
>>  exec_dp3(struct tgsi_exec_machine *mach,
>>           const struct tgsi_full_instruction *inst)
>> @@ -3571,6 +3605,135 @@ micro_ucmp(union tgsi_exec_channel *dst,
>>  }
>>
>>  static void
>> +micro_ibfe(union tgsi_exec_channel *dst,
>> +           const union tgsi_exec_channel *src0,
>> +           const union tgsi_exec_channel *src1,
>> +           const union tgsi_exec_channel *src2)
>> +{
>> +   int i;
>> +   for (i = 0; i < 4; i++) {
>> +      int width = src2->i[i] & 0x1f;
>> +      int offset = src1->i[i] & 0x1f;
>> +      if (width == 0)
>> +         dst->i[i] = 0;
>> +      else if (width + offset < 32)
>> +         dst->i[i] = (src0->i[i] << (32 - width - offset)) >> (32 - width);
>> +      else
>> +         dst->i[i] = src0->i[i] >> offset;
>> +   }
>> +}
>> +
>> +static void
>> +micro_ubfe(union tgsi_exec_channel *dst,
>> +           const union tgsi_exec_channel *src0,
>> +           const union tgsi_exec_channel *src1,
>> +           const union tgsi_exec_channel *src2)
>> +{
>> +   int i;
>> +   for (i = 0; i < 4; i++) {
>> +      int width = src2->u[i] & 0x1f;
>> +      int offset = src1->u[i] & 0x1f;
>> +      if (width == 0)
>> +         dst->u[i] = 0;
>> +      else if (width + offset < 32)
>> +         dst->u[i] = (src0->u[i] << (32 - width - offset)) >> (32 - width);
>> +      else
>> +         dst->u[i] = src0->u[i] >> offset;
>> +   }
>> +}
>> +
>> +static void
>> +micro_bfi(union tgsi_exec_channel *dst,
>> +          const union tgsi_exec_channel *src0,
>> +          const union tgsi_exec_channel *src1,
>> +          const union tgsi_exec_channel *src2,
>> +          const union tgsi_exec_channel *src3)
>> +{
>> +   int i;
>> +   for (i = 0; i < 4; i++) {
>> +      int width = src3->u[i] & 0x1f;
>> +      int offset = src2->u[i] & 0x1f;
>> +      int bitmask = ((1 << width) - 1) << offset;
>> +      dst->u[i] = ((src1->u[i] << offset) & bitmask) | (src0->u[i] & ~bitmask);
>> +   }
>> +}
>> +
>> +static void
>> +micro_brev(union tgsi_exec_channel *dst,
>> +           const union tgsi_exec_channel *src)
>> +{
>> +   int i;
>> +   static const unsigned reverse[16] = {
>> +      [0x0] = 0x0,
>> +      [0x1] = 0x8,
>> +      [0x2] = 0x4,
>> +      [0x3] = 0xc,
>> +      [0x4] = 0x2,
>> +      [0x5] = 0xa,
>> +      [0x6] = 0x6,
>> +      [0x7] = 0xe,
>> +      [0x8] = 0x1,
>> +      [0x9] = 0x9,
>> +      [0xa] = 0x5,
>> +      [0xb] = 0xd,
>> +      [0xc] = 0x3,
>> +      [0xd] = 0xb,
>> +      [0xe] = 0x7,
>> +      [0xf] = 0xf,
>> +   };
>> +   for (i = 0; i < 4; i++) {
>> +      dst->u[i] = (reverse[(src->u[i] >> 0) & 0xf] << 28 |
>> +                   reverse[(src->u[i] >> 4) & 0xf] << 24 |
>> +                   reverse[(src->u[i] >> 8) & 0xf] << 20 |
>> +                   reverse[(src->u[i] >> 12) & 0xf] << 16 |
>> +                   reverse[(src->u[i] >> 16) & 0xf] << 12 |
>> +                   reverse[(src->u[i] >> 20) & 0xf] << 8 |
>> +                   reverse[(src->u[i] >> 24) & 0xf] << 4 |
>> +                   reverse[(src->u[i] >> 28) & 0xf] << 0);
>> +   }
>> +}
> Hmm, looks like that opcode is slow even by softpipe's standards.

I also can't imagine any uses for it (otherwise perhaps modern CPUs
would have a way of doing this that's a little more direct). Oh well,
it's part of ARB_gpu_shader5. I guess we could lower it into lots of other
opcodes out of spite, but I think both radeon/nvc0 have explicit
instructions for it, so might as well pipe it through.

> (luckily llvmpipe should be able to exploit pshufb for a rather nice
> implementation, though it will require ssse3...).
> I think though something like
> uint32_t reverse(uint32_t x)
> {
>     x = ((x >> 1) & 0x55555555u) | ((x & 0x55555555u) << 1);
>     x = ((x >> 2) & 0x33333333u) | ((x & 0x33333333u) << 2);
>     x = ((x >> 4) & 0x0f0f0f0fu) | ((x & 0x0f0f0f0fu) << 4);
>     x = ((x >> 8) & 0x00ff00ffu) | ((x & 0x00ff00ffu) << 8);
>     x = ((x >> 16) & 0xffffu) | ((x & 0xffffu) << 16);
>     return x;
> }
>
> blatantly copied from
> http://stackoverflow.com/questions/9144800/c-reverse-bits-in-unsigned-integer
> should be faster. But whatever works...

That's pretty clever. I actually wonder if that's faster than the
lookup method, since this isn't really superscalar-friendly (each step
depends on the previous step). But it's neat, I copied it into
util_bitreverse().

>
>
>> +
>> +static void
>> +micro_popc(union tgsi_exec_channel *dst,
>> +           const union tgsi_exec_channel *src)
>> +{
>> +   dst->u[0] = util_bitcount(src->u[0]);
>> +   dst->u[1] = util_bitcount(src->u[1]);
>> +   dst->u[2] = util_bitcount(src->u[2]);
>> +   dst->u[3] = util_bitcount(src->u[3]);
>> +}
>> +
>> +static void
>> +micro_lsb(union tgsi_exec_channel *dst,
>> +          const union tgsi_exec_channel *src)
>> +{
>> +   dst->i[0] = ffs(src->u[0]) - 1;
>> +   dst->i[1] = ffs(src->u[1]) - 1;
>> +   dst->i[2] = ffs(src->u[2]) - 1;
>> +   dst->i[3] = ffs(src->u[3]) - 1;
>> +}
>> +
>> +static void
>> +micro_imsb(union tgsi_exec_channel *dst,
>> +           const union tgsi_exec_channel *src)
>> +{
>> +   dst->i[0] = util_last_bit_signed(src->i[0]) - 1;
>> +   dst->i[1] = util_last_bit_signed(src->i[1]) - 1;
>> +   dst->i[2] = util_last_bit_signed(src->i[2]) - 1;
>> +   dst->i[3] = util_last_bit_signed(src->i[3]) - 1;
>> +}
>> +
>> +static void
>> +micro_umsb(union tgsi_exec_channel *dst,
>> +           const union tgsi_exec_channel *src)
>> +{
>> +   dst->i[0] = util_last_bit(src->u[0]) - 1;
>> +   dst->i[1] = util_last_bit(src->u[1]) - 1;
>> +   dst->i[2] = util_last_bit(src->u[2]) - 1;
>> +   dst->i[3] = util_last_bit(src->u[3]) - 1;
>> +}
>> +
>> +static void
>>  exec_instruction(
>>     struct tgsi_exec_machine *mach,
>>     const struct tgsi_full_instruction *inst,
>> @@ -4417,6 +4580,31 @@ exec_instruction(
>>        /* src[2] = sampler unit */
>>        exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 2);
>>        break;
>> +
>> +   case TGSI_OPCODE_IBFE:
>> +      exec_vector_trinary(mach, inst, micro_ibfe, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
>> +      break;
>> +   case TGSI_OPCODE_UBFE:
>> +      exec_vector_trinary(mach, inst, micro_ubfe, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
>> +      break;
>> +   case TGSI_OPCODE_BFI:
>> +      exec_vector_quaternary(mach, inst, micro_bfi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
>> +      break;
>> +   case TGSI_OPCODE_BREV:
>> +      exec_vector_unary(mach, inst, micro_brev, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
>> +      break;
>> +   case TGSI_OPCODE_POPC:
>> +      exec_vector_unary(mach, inst, micro_popc, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
>> +      break;
>> +   case TGSI_OPCODE_LSB:
>> +      exec_vector_unary(mach, inst, micro_lsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
>> +      break;
>> +   case TGSI_OPCODE_IMSB:
>> +      exec_vector_unary(mach, inst, micro_imsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
>> +      break;
>> +   case TGSI_OPCODE_UMSB:
>> +      exec_vector_unary(mach, inst, micro_umsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
>> +      break;
>>     default:
>>        assert( 0 );
>>     }
>> diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
>> index ec03e4e..5b811e3 100644
>> --- a/src/gallium/auxiliary/util/u_math.h
>> +++ b/src/gallium/auxiliary/util/u_math.h
>> @@ -567,6 +567,17 @@ static INLINE unsigned util_last_bit(unsigned u)
>>  #endif
>>  }
>>
>> +static INLINE unsigned util_last_bit_signed(int i)
> Similar to my previous comment, I find that name confusing. Maybe
> last_significant_bit_signed or something like that? No biggie though.

It parallels the naming of util_last_bit(). Perhaps that one should
also be last_significant_bit, in which case I'd be happy to rename
this function too. (I believe the current naming reflects 'fls' --
'find last set'.)

> Otherwise looks good to me.
>
> Roland
>
>> +{
>> +#if defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 407)
>> +   return 31 - __builtin_clrsb(i);
>> +#else
>> +   if (i >= 0)
>> +      return util_last_bit(i);
>> +   else
>> +      return util_last_bit(~(unsigned)i);
>> +#endif
>> +}
>>
>>  /* Destructively loop over all of the bits in a mask as in:
>>   *
>>