[Mesa-dev] [PATCH 10/11] clover: Add function for building a clover::module for non-TGSI targets

Sat May 12 15:40:43 PDT 2012

Tom Stellard <tstellar at gmail.com> writes:

> ---
>  .../state_trackers/clover/llvm/invocation.cpp      |  174 +++++++++++++++++++-
>  1 files changed, 165 insertions(+), 9 deletions(-)
>
> diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
> index 89e21bf..b31fddc 100644
> --- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
> +++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
> @@ -22,24 +22,33 @@
>  
>  #include "core/compiler.hpp"
>  
> -#if 0
>  #include <clang/Frontend/CompilerInstance.h>
>  #include <clang/Frontend/TextDiagnosticPrinter.h>
>  #include <clang/CodeGen/CodeGenAction.h>
> +#include <llvm/Bitcode/BitstreamWriter.h>
> +#include <llvm/Bitcode/ReaderWriter.h>
> +#include <llvm/DerivedTypes.h>
> +#include <llvm/Linker.h>
>  #include <llvm/LLVMContext.h>
> +#include <llvm/Module.h>
> +#include <llvm/PassManager.h>
>  #include <llvm/Support/TargetSelect.h>
>  #include <llvm/Support/MemoryBuffer.h>
> +#include <llvm/Support/PathV1.h>
> +#include <llvm/Target/TargetData.h>
> +#include <llvm/Transforms/IPO/PassManagerBuilder.h>
> +
> +#include "util/u_memory.h"
>  
>  #include <iostream>
>  #include <iomanip>
>  #include <fstream>
>  #include <cstdio>
> -#endif
>  
>  using namespace clover;
>  
> -#if 0
>  namespace {
> +#if 0
>     void
>     build_binary(const std::string &source, const std::string &target,
>                  const std::string &name) {
> @@ -78,17 +87,164 @@ namespace {
>        compat::istream cs(str);
>        return module::deserialize(cs);
>     }
> -}
>  #endif
> +   module
> +   build_module_llvm(const std::string &source, const std::string &target,
> +                     const std::string &name) {
> +
> +      /* Compile the kernel */
> +      clang::CompilerInstance c;
> +      module m;
> +      clang::EmitLLVMOnlyAction act(&llvm::getGlobalContext());
> +      std::string log;
> +      std::string target_triple = target + "--";
> +      llvm::raw_string_ostream s_log(log);
> +
> +#if HAVE_LLVM <= 0x0300
> +      c.getFrontendOpts().Inputs.push_back(
> +         std::make_pair(clang::IK_OpenCL, "cl_input"));
> +#else
> +      c.getFrontendOpts().Inputs.push_back(
> +         clang::FrontendInputFile("cl_input", clang::IK_OpenCL));
> +#endif
> +      c.getFrontendOpts().ProgramAction = clang::frontend::EmitLLVMOnly;
> +      c.getHeaderSearchOpts().UseBuiltinIncludes = true;
> +#if HAVE_LLVM < 0x0300
> +      c.getHeaderSearchOpts().UseStandardIncludes = true;
> +#else
> +      c.getHeaderSearchOpts().UseStandardSystemIncludes = true;
> +#endif
> +      c.getHeaderSearchOpts().ResourceDir = CLANG_RESOURCE_DIR;
> +
> +      /* Add libclc generic search path */
> +      c.getHeaderSearchOpts().AddPath(LIBCLC_PATH "/generic/include/",
> +                                      clang::frontend::Angled,
> +                                      false, false, false);
> +
> +      /* Add libclc target specific search path */
> +      c.getHeaderSearchOpts().AddPath(LIBCLC_PATH + target + "/include/",
> +                                      clang::frontend::Angled,
> +                                      false, false, false);
> +
> +      /* Add libclc include */
> +      c.getPreprocessorOpts().Includes.push_back("clc/clc.h");
> +      /* clc.h requires that this macro be defined: */
> +      c.getPreprocessorOpts().addMacroDef("cl_clang_storage_class_specifiers");
> +
> +      c.getLangOpts().NoBuiltin = true;
> +      c.getTargetOpts().Triple = target_triple;
> +      c.getInvocation().setLangDefaults(clang::IK_OpenCL);
> +      c.createDiagnostics(0, NULL, new clang::TextDiagnosticPrinter(
> +                             s_log, c.getDiagnosticOpts()));
> +
> +      c.getPreprocessorOpts().addRemappedFile(
> +         "cl_input", llvm::MemoryBuffer::getMemBuffer(source));
> +
> +      /* Compile the code */
> +      if (!c.ExecuteAction(act))
> +         throw build_error(log);
> +
> +      /* Link the kernel with libclc */
> +      llvm::PassManager PM;
> +      llvm::PassManagerBuilder Builder;
> +      bool isNative;
> +      llvm::Module * mod = act.takeModule();
> +      llvm::Linker linker("clover", mod);
> +
> +      linker.LinkInFile(llvm::sys::Path(LIBCLC_PATH + target_triple + "/lib/builtins.bc"), isNative);
> +      mod = linker.releaseModule();
> +
> +      /* Run link time optimizations */
> +      Builder.populateLTOPassManager(PM, false, true);
> +      Builder.OptLevel = 2;
> +      PM.run(*mod);
> +
> +      /* Build the clover::module */
> +      unsigned char * prog;
> +      uint32_t prog_sz;
> +
> +#if HAVE_LLVM > 0x0300
> +      llvm::SmallVector<char, 1024> llvm_bitcode;
> +      llvm::raw_svector_ostream bitcode_ostream(llvm_bitcode);
> +#else
> +      std::vector<unsigned char> llvm_bitcode;
> +#endif
> +      llvm::BitstreamWriter writer(llvm_bitcode);
> +
> +#if HAVE_LLVM <= 0x0300
> +      llvm::WriteBitcodeToStream(mod, writer);
> +#else
> +      llvm::WriteBitcodeToFile(mod, bitcode_ostream);
> +      bitcode_ostream.flush();
> +#endif
> +
> +      prog_sz = llvm_bitcode.size() * sizeof(unsigned char);
> +
> +      /* We need to add 4 to the program size, because we will
> +       * be preprending the length of the program to the bitcode string. */
> +      prog = (unsigned char *)MALLOC(prog_sz + 4);
> +      ((uint32_t *)prog)[0] = prog_sz;
> +      memcpy(prog + 4, &llvm_bitcode[0], prog_sz);
> +
> +      std::string kernel_name;
> +      compat::vector<module::argument> args;
> +      const llvm::NamedMDNode * kernel_node =
> +                                    mod->getNamedMetadata("opencl.kernels");
> +      /* XXX: Support more than one kernel */
> +      /* XXX: Error if there are no kernels */
> +      assert(kernel_node->getNumOperands() == 1);
> +
> +      llvm::Function * kernel_func = llvm::dyn_cast<llvm::Function>(
> +                                      kernel_node->getOperand(0)->getOperand(0));
> +      kernel_name = kernel_func->getName();
> +
> +      for (llvm::Function::arg_iterator I = kernel_func->arg_begin(),
> +                                      E = kernel_func->arg_end(); I != E; ++I) {
> +         llvm::Argument & arg = *I;
> +         llvm::Type * arg_type = arg.getType();
> +         llvm::TargetData TD(kernel_func->getParent());
> +         unsigned arg_size = TD.getTypeStoreSize(arg_type);
> +
> +         if (llvm::isa<llvm::PointerType>(arg_type) and arg.hasByValAttr()) {
> +            arg_type =
> +               llvm::dyn_cast<llvm::PointerType>(arg_type)->getElementType();
> +         }
> +
> +         if (arg_type->isPointerTy()) {
> +            /* XXX: Figure out LLVM->OpenCL address space mappings for each
> +             * target.  I think we need to ask clang what these are.  For now,
> +             * pretend everything is in the global address space. */
> +            unsigned address_space = llvm::cast<llvm::PointerType>(arg_type)->getAddressSpace();
> +            switch (address_space) {
> +               default:
> +                  args.push_back(module::argument(module::argument::global, arg_size));
> +                  break;
> +            }
> +         } else {
> +            args.push_back(module::argument(module::argument::scalar, arg_size));
> +         }
> +      }
> +      m.syms.push_back(module::symbol(kernel_name, 0, 0, args ));
> +      m.secs.push_back(module::section(0, module::section::text, prog_sz + 4,
> +                       compat::vector<char>((char *)prog, prog_sz + 4)));
> +      return m;
> +   }
> +}
>  
>  module
>  clover::compile_program_llvm(const compat::string &source,
>                               const compat::string &target) {
> +
> +   if (target == compat::string("TGSI")) {
>  #if 0
> -   build_binary(source, target, "cl_input");
> -   module m = load_binary("cl_input.o");
> -   std::remove("cl_input.o");
> -   return m;
> +      build_binary(source, target, "cl_input");
> +      module m = load_binary("cl_input.o");
> +      std::remove("cl_input.o");
> +      return m;
> +#else
> +      return module();
> +   } else {
> +      return build_module_llvm(source, target, "cl_input");
> +   }
>  #endif
> -   return module();
>  }

Hi Tom,

I'd really like to see this done in a different way.  If I understand
correctly, with this patch LLVM is invoked twice: the first time here to
generate some R600-specific LLVM bitcode, and then again in the pipe
driver to run it through the R600 back-end and generate the actual
machine code.  In particular this means that:

 - The state tracker needs a special case for radeon in order to know
   that the compilation process has to be terminated prematurely before
   it gets to generate the machine code.

 - I'm not sure you can take advantage of clCreateProgramWithBinary()
   because the state tracker doesn't have access to the final machine
   code (this is probably going to be a minor problem in practice
   though).

I can think of two different ways this could work (your solution would
be somewhere in between):

 - The r600g LLVM back-end could support some well-defined output object
   format (e.g. the one implemented by the clover::module classes), like
   most of the other LLVM back-ends.  You probably want to do this
   anyway if you want to be able to compile CL programs off-line.  If
   you do it this way, the state tracker will just call clang to
   completion using the appropriate target and pass the generated
   machine code to the pipe driver.

   If you think supporting different hardware versions with different
   ISAs would be a problem under this scheme, we could have another
   compute cap that would determine a specific variant of the ISA.

 - Another option would be to forget about driver-specific IRs in the
   compute API.  The pipe driver would have the choice between TGSI and
   LLVM; if it wants LLVM, the state tracker would do roughly what
   you're doing here using some sort of "identity" LLVM target that
   would do nothing but describe the peculiarities of the hardware
   (e.g. endianness, widths of the supported primitive types), which in
   turn would be queried from the pipe driver using compute caps.

Personally I think the latter would be closer to ideal, but I guess it
would also involve more work...
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 229 bytes
Desc: not available
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20120513/383b9742/attachment.pgp>