[Mesa-dev] [PATCH mesa v2 1/2] nouveau: codegen: Use FILE_MEMORY_BUFFER for buffers

Fri Apr 8 10:17:53 UTC 2016

Hi,

On 23-03-16 23:10, Samuel Pitoiset wrote:
> Are you sure this won't break compute shaders on fermi?
> Could you please double-check that?

I just checked:

lspci:
01:00.0 VGA compatible controller: NVIDIA Corporation GF119 [GeForce GT 610] (rev a1)

Before this patch-set:

[hans at plank piglit]$ ./piglit run -o shader -t '.*arb_shader_storage_buffer_object.*' results/shader
[9/9] pass: 9 /

After this patch-set:

[hans at plank piglit]$ ./piglit run -o shader -t '.*arb_shader_storage_buffer_object.*' results/shader
[9/9] pass: 9 /

> One minor comment below.
>
> On 03/17/2016 05:07 PM, Hans de Goede wrote:
>> Some of the lowering steps we currently do for FILE_MEMORY_GLOBAL only
>> apply to buffers, making it impossible to use FILE_MEMORY_GLOBAL for
>> OpenCL global buffers.
>>
>> This commits changes the buffer code to use FILE_MEMORY_BUFFER at the
>> ir_from_tgsi and lowering steps, freeing use of FILE_MEMORY_GLOBAL
>> for use with OpenCL global buffers.
>>
>> Note that after lowering buffer accesses use the FILE_MEMORY_GLOBAL
>> register file.
>>
>> Tested with piglet on a gk107, before this patch:
>> ./piglit run -o shader -t '.*arb_shader_storage_buffer_object.*' results/shader
>> [9/9] pass: 9 /
>> after:
>> ./piglit run -o shader -t '.*arb_shader_storage_buffer_object.*' results/shader
>> [9/9] pass: 9 /
>>
>> Signed-off-by: Hans de Goede <hdegoede at redhat.com>
>> ---
>> Changes in v2:
>> -New patch in v2 of patch-set to re-enable support for global opencl buffers
>> ---
>>   src/gallium/drivers/nouveau/codegen/nv50_ir.h                 | 1 +
>>   src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp     | 2 +-
>>   src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 8 +++++---
>>   src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp         | 1 +
>>   src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp   | 5 ++++-
>>   src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp   | 1 +
>>   6 files changed, 13 insertions(+), 5 deletions(-)
>>
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
>> index 7b0eb2f..5141fc6 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
>> @@ -332,6 +332,7 @@ enum DataFile
>>      FILE_MEMORY_CONST,
>>      FILE_SHADER_INPUT,
>>      FILE_SHADER_OUTPUT,
>> +   FILE_MEMORY_BUFFER,
>>      FILE_MEMORY_GLOBAL,
>>      FILE_MEMORY_SHARED,
>>      FILE_MEMORY_LOCAL,
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
>> index baa2e30..7ae0cb2 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
>> @@ -373,7 +373,7 @@ static nv50_ir::DataFile translateFile(uint file)
>>      case TGSI_FILE_PREDICATE:       return nv50_ir::FILE_PREDICATE;
>>      case TGSI_FILE_IMMEDIATE:       return nv50_ir::FILE_IMMEDIATE;
>>      case TGSI_FILE_SYSTEM_VALUE:    return nv50_ir::FILE_SYSTEM_VALUE;
>> -   case TGSI_FILE_BUFFER:          return nv50_ir::FILE_MEMORY_GLOBAL;
>> +   case TGSI_FILE_BUFFER:          return nv50_ir::FILE_MEMORY_BUFFER;
>>      case TGSI_FILE_MEMORY:          return nv50_ir::FILE_MEMORY_GLOBAL;
>>      case TGSI_FILE_SAMPLER:
>>      case TGSI_FILE_NULL:
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> index d0936d8..628deb7 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> @@ -1141,13 +1141,14 @@ NVC0LoweringPass::handleATOM(Instruction *atom)
>>         handleSharedATOM(atom);
>>         return true;
>>      default:
>> -      assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
>> +      assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER);
>>         base = loadResInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
>>         assert(base->reg.size == 8);
>>         if (ptr)
>>            base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
>>         assert(base->reg.size == 8);
>>         atom->setIndirect(0, 0, base);
>> +      atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
>>         return true;
>>      }
>>      base =
>> @@ -1963,7 +1964,7 @@ NVC0LoweringPass::visit(Instruction *i)
>>         } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
>>            assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
>>            i->op = OP_VFETCH;
>> -      } else if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
>> +      } else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) {
>>            Value *ind = i->getIndirect(0, 1);
>>            Value *ptr = loadResInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
>>            // XXX come up with a way not to do this for EVERY little access but
>> @@ -1978,6 +1979,7 @@ NVC0LoweringPass::visit(Instruction *i)
>>            }
>>            i->setIndirect(0, 1, NULL);
>>            i->setIndirect(0, 0, ptr);
>> +         i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
>>            bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
>>            i->setPredicate(CC_NOT_P, pred);
>>            if (i->defExists(0)) {
>> @@ -1987,7 +1989,7 @@ NVC0LoweringPass::visit(Instruction *i)
>>         break;
>>      case OP_ATOM:
>>      {
>> -      const bool cctl = i->src(0).getFile() == FILE_MEMORY_GLOBAL;
>> +      const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER;
>>         handleATOM(i);
>>         handleCasExch(i, cctl);
>>      }
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
>> index cfa85ec..870b36e 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
>> @@ -455,6 +455,7 @@ int Symbol::print(char *buf, size_t size,
>>      case FILE_MEMORY_CONST:  c = 'c'; break;
>>      case FILE_SHADER_INPUT:  c = 'a'; break;
>>      case FILE_SHADER_OUTPUT: c = 'o'; break;
>> +   case FILE_MEMORY_BUFFER: c = 'b'; break; // Only used before lowering
>
> Could you please show me the output of NV50_PROG_DEBUG=255 with a test which uses this file type? I'm not sure if using b[] is better than g[] actually.

NV50_PROG_DEBUG=255 bin/arb_shader_storage_buffer_object-rendering

...

MAIN:-1 ()
BB:0 (52 instructions) - df = { }
  -> BB:1 (tree)
   0: mov u32 %r1 0x00000000 (0)
   1: ld  u32 %r0 b[0x0] (0)
   2: presin f32 %r3 %r0 (0)
   3: cos f32 %r3 %r3 (0)
   4: mov u32 %r2 %r3 (0)
   ...

And after the first lowering step:

MAIN:-1 ()
BB:0 (77 instructions) - df = { }
  -> BB:1 (tree)
   0: mov u32 %r56 0x00000000 (0)
   1: ld  u64 %r57d c15[0x300] (0)
   2: mov u32 %r58 0x00000004 (0)
   3: ld  u32 %r59 c15[0x308] (0)
   4: set u8 %p60 gt u32 %r58 %r59 (0)
   5: mov u32 %r61 0x00000000 (0)
   6: not %p60 ld  u32 %r62 g[%r57d+0x0] (0)
   ...

Note how the 'b' printing is only used before the buffer access
is lowered to a global access, so this seems to be the right thing todo.

Regards,

Hans

>
>>      case FILE_MEMORY_GLOBAL: c = 'g'; break;
>>      case FILE_MEMORY_SHARED: c = 's'; break;
>>      case FILE_MEMORY_LOCAL:  c = 'l'; break;
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
>> index 2c4d7f5..2af1715 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
>> @@ -207,6 +207,7 @@ TargetNV50::getFileSize(DataFile file) const
>>      case FILE_MEMORY_CONST:  return 65536;
>>      case FILE_SHADER_INPUT:  return 0x200;
>>      case FILE_SHADER_OUTPUT: return 0x200;
>> +   case FILE_MEMORY_BUFFER: return 0xffffffff;
>>      case FILE_MEMORY_GLOBAL: return 0xffffffff;
>>      case FILE_MEMORY_SHARED: return 16 << 10;
>>      case FILE_MEMORY_LOCAL:  return 48 << 10;
>> @@ -406,7 +407,8 @@ TargetNV50::isAccessSupported(DataFile file, DataType ty) const
>>      if (ty == TYPE_B96 || ty == TYPE_NONE)
>>         return false;
>>      if (typeSizeof(ty) > 4)
>> -      return (file == FILE_MEMORY_LOCAL) || (file == FILE_MEMORY_GLOBAL);
>> +      return (file == FILE_MEMORY_LOCAL) || (file == FILE_MEMORY_GLOBAL) ||
>> +             (file == FILE_MEMORY_BUFFER);
>>      return true;
>>   }
>>
>> @@ -509,6 +511,7 @@ int TargetNV50::getLatency(const Instruction *i) const
>>         switch (i->src(0).getFile()) {
>>         case FILE_MEMORY_LOCAL:
>>         case FILE_MEMORY_GLOBAL:
>> +      case FILE_MEMORY_BUFFER:
>>            return 100; // really 400 to 800
>>         default:
>>            return 22;
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
>> index a03afa8..9e1e7bf 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
>> @@ -248,6 +248,7 @@ TargetNVC0::getFileSize(DataFile file) const
>>      case FILE_MEMORY_CONST:  return 65536;
>>      case FILE_SHADER_INPUT:  return 0x400;
>>      case FILE_SHADER_OUTPUT: return 0x400;
>> +   case FILE_MEMORY_BUFFER: return 0xffffffff;
>>      case FILE_MEMORY_GLOBAL: return 0xffffffff;
>>      case FILE_MEMORY_SHARED: return 16 << 10;
>>      case FILE_MEMORY_LOCAL:  return 48 << 10;
>>