[Mesa-dev] [PATCH 16/22] intel/tools/BatchbufferLogger: first implementation
kevin.rogovin at intel.com
Mon Sep 25 10:34:16 UTC 2017
From: Kevin Rogovin <kevin.rogovin at intel.com>
Initial implementation of BatchbufferLogger, a tool
to correlate batchbuffer contents with GL/GLES API
calls without changing what is sent to the GPU.
Signed-off-by: Kevin Rogovin <kevin.rogovin at intel.com>
---
src/intel/Makefile.tools.am | 27 +
src/intel/tools/i965_batchbuffer_logger.cpp | 4221 ++++++++++++++++++++
.../tools/i965_batchbuffer_logger_instructions.h | 131 +
3 files changed, 4379 insertions(+)
create mode 100644 src/intel/tools/i965_batchbuffer_logger.cpp
create mode 100644 src/intel/tools/i965_batchbuffer_logger_instructions.h
diff --git a/src/intel/Makefile.tools.am b/src/intel/Makefile.tools.am
index 8071220..6e3e497 100644
--- a/src/intel/Makefile.tools.am
+++ b/src/intel/Makefile.tools.am
@@ -19,10 +19,15 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
+intellibdir = $(libdir)
+
noinst_PROGRAMS += \
tools/aubinator \
tools/aubinator_error_decode
+intellib_LTLIBRARIES = \
+ tools/libi965_batchbuffer_logger.la
+
tools_aubinator_SOURCES = \
tools/aubinator.c \
tools/disasm.c \
@@ -63,3 +68,25 @@ tools_aubinator_error_decode_CFLAGS = \
$(AM_CFLAGS) \
$(EXPAT_CFLAGS) \
$(ZLIB_CFLAGS)
+
+tools_libi965_batchbuffer_logger_la_SOURCES = \
+ tools/i965_batchbuffer_logger.cpp \
+ tools/gen_disasm.h \
+ tools/disasm.c \
+ tools/intel_aub.h
+
+tools_libi965_batchbuffer_logger_la_LIBADD = \
+ common/libintel_common.la \
+ compiler/libintel_compiler.la \
+ $(top_builddir)/src/util/libmesautil.la \
+ $(top_builddir)/src/compiler/nir/libnir.la \
+ isl/libisl.la \
+ $(PTHREAD_LIBS) \
+ $(EXPAT_LIBS) \
+ $(ZLIB_LIBS)
+
+tools_libi965_batchbuffer_logger_la_CXXFLAGS = \
+ $(AM_CXXFLAGS) -std=c++11
+
+tools_libi965_batchbuffer_logger_la_LDFLAGS = \
+ -no-undefined -module -avoid-version -shared -shrext .so
diff --git a/src/intel/tools/i965_batchbuffer_logger.cpp b/src/intel/tools/i965_batchbuffer_logger.cpp
new file mode 100644
index 0000000..6300d18
--- /dev/null
+++ b/src/intel/tools/i965_batchbuffer_logger.cpp
@@ -0,0 +1,4221 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <mutex>
+#include <map>
+#include <vector>
+#include <string>
+#include <list>
+#include <sstream>
+#include <fstream>
+#include <iomanip>
+#include <algorithm>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <typeinfo>
+#include <memory>
+#include <functional>
+
+#include <stdarg.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <dlfcn.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <pthread.h>
+
+#include "i965_batchbuffer_logger_instructions.h"
+#include "drm-uapi/i915_drm.h"
+#include "common/gen_decoder.h"
+#include "gen_disasm.h"
+#include "compiler/brw_inst.h"
+#include "util/mesa-sha1.h"
+
+#include "tools/i965_batchbuffer_logger_app.h"
+#include "tools/i965_batchbuffer_logger_output.h"
+#include "tools/i965_batchbuffer_logger.h"
+
+/* Basic overview of implementation:
+ * - BatchbufferLogger is a singleton to allow for calls into it
+ * without needing the object itself
+ *
+ * - Using the driver provided function pointer, it "knows" what
+ * is considered the active batchbuffer
+ * * NOTE: before being initialized by a driver, the function
+ * pointer specifying the active batchbuffer returns a value
+ * indicating that there is no active batchbuffer
+ *
+ * - BatchbufferLogger has a map keyed by file descriptor of
+ * GEMBufferTracker objects. A GEMBufferTracker has within it
+ * * a map keyed by GEM BO handle of GEMBufferObjects
+ * * a map keyed by GEM BO handle of BatchbufferLog
+ * * a dummy BatchbufferLog object
+ *
+ * - A BatchbufferLog object is essentially a log of what
+ * API calls are made when in a batchbuffer
+ *
+ * - A BatchbufferLog object is removed when any of the
+ * following occur:
+ * * The driver calls aborted_batchbuffer(); upon removal,
+ * the log is emitted.
+ * * at drmIoctl, the GEM BO is used as the command buffer in
+ * an execbuffer ioctl. Upon this ioctl, the associated
+ * BatchbufferLog is emitted.
+ *
+ * - A BatchbufferLog object is added to a GEMBufferTracker
+ * whenever a GEM BO handle not seen before is emitted by
+ * the function pointer provided by the driver that gives the
+ * active batchbuffer.
+ *
+ * - Whenever an entry is added to a BatchbufferLog object A,
+ *   any entries in the dummy BatchbufferLog are moved to A
+ *   in such a way that, when A is printed to file, the
+ *   entries from the dummy come first.
+ *
+ * - The following environment variables control what is written to
+ *   file:
+ * * I965_DECODE_LEVEL controls the level of batchbuffer decoding
+ * - no_decode do not decode batchbuffer at all
+ * - instruction_decode decode instruction name only
+ * - instruction_details_decode decode instruction contents
+ * * I965_PRINT_RELOC_LEVEL controls at what level to print reloc data
+ * - print_reloc_nothing do not print reloc data
+ * - print_reloc_gem_gpu_updates print reloc data GEM by GEM
+ * * I965_DECODE_VS : if 0, vertex shader binaries are written to file;
+ * otherwise their disassembly is emitted
+ * * I965_DECODE_HS : if 0, hull shader binaries are written to file;
+ * otherwise their disassembly is emitted
+ * * I965_DECODE_DS : if 0, domain shader binaries are written to file;
+ * otherwise their disassembly is emitted
+ *  * I965_DECODE_GS : if 0, geometry shader binaries are written to file;
+ * otherwise their disassembly is emitted
+ * * I965_DECODE_PS8 : if 0, 8-wide pixel shader binaries are written
+ * to file; otherwise their disassembly is emitted
+ * * I965_DECODE_PS16 : if 0, 16-wide pixel shader binaries are written
+ * to file; otherwise their disassembly is emitted
+ * * I965_DECODE_PS32 : if 0, 32-wide pixel shader binaries are written
+ * to file; otherwise their disassembly is emitted
+ * * I965_DECODE_CS : if 0, media/compute shader binaries are written to
+ * file; otherwise their disassembly is emitted
+ */
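+
+/* A minimal sketch (illustrative only, not part of this patch) of how
+ * an application-side shim might bracket an API call with the logger's
+ * app interface; the function-pointer member names used here are
+ * assumptions, while the signatures follow pre_call_fcn() and
+ * post_call_fcn() defined below:
+ *
+ *   struct i965_batchbuffer_logger_app *app; // acquired from the .so
+ *   app->pre_call(app, call_id,
+ *                 "glDrawArrays(GL_TRIANGLES, 0, 3)", "glDrawArrays");
+ *   glDrawArrays(GL_TRIANGLES, 0, 3);
+ *   app->post_call(app, call_id);
+ */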
+
+namespace {
+
+bool
+is_header_field(struct gen_group *group, struct gen_field *field)
+{
+ uint32_t bits;
+
+ if (field->start >= 32)
+ return false;
+
+ bits = (1U << (field->end - field->start + 1)) - 1;
+ bits <<= field->start;
+
+ return (group->opcode_mask & bits) != 0;
+}
+
+template<typename T>
+T
+read_from_environment(const char *env, T default_value)
+{
+ const char *tmp;
+ T return_value(default_value);
+
+ tmp = std::getenv(env);
+ if (tmp != nullptr) {
+    std::istringstream istr(tmp);
+ istr >> return_value;
+ }
+
+ return return_value;
+}
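+
+/* Usage sketch (illustrative only): the shader decode flags documented
+ * above can be read as integers with a fall-back default, e.g.
+ *
+ *   bool decode_vs = read_from_environment<int>("I965_DECODE_VS", 0) != 0;
+ *
+ * an unset variable yields the supplied default value.
+ */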
+
+class BatchbufferLoggerOutput;
+class APIStartCallMarker;
+class GEMBufferObject;
+class GPUCommandFieldValue;
+class GPUAddressQuery;
+class GPUCommand;
+class i965LatchState;
+class i965Registers;
+class i965HWContextData;
+class GPUState;
+class BatchRelocs;
+class ShaderFileList;
+class BatchbufferDecoder;
+class BatchbufferLog;
+class GEMBufferTracker;
+
+/* BatchbufferLoggerOutput wraps the file to which the log is
+ * written, providing nested blocks and name/value records.
+ */
+class BatchbufferLoggerOutput {
+public:
+ explicit
+ BatchbufferLoggerOutput(void):
+ m_file(nullptr),
+ m_current_block_level(0)
+ {}
+
+ ~BatchbufferLoggerOutput();
+
+ void
+ open(const char *filename);
+
+ void
+ close(void);
+
+ operator bool() const
+ {
+ return m_file != nullptr;
+ }
+
+ void
+ begin_block(const char *txt);
+
+ void
+ begin_block_value(const char *txt, const char *fmt, ...);
+
+ void
+ vbegin_block_value(const char *txt, const char *fmt, va_list va);
+
+ void
+ end_block(void);
+
+ void
+ clear_block_stack(unsigned int desired_depth = 0);
+
+ unsigned int
+ current_block_level(void)
+ {
+ return m_current_block_level;
+ }
+
+ const std::string&
+ filename(void) const
+ {
+ return m_filename;
+ }
+
+ void
+ print_value(const char *name, const char *fmt, ...);
+
+ void
+ vprint_value(const char *name, const char *fmt, va_list va);
+
+ template<typename F>
+ void
+ functor_print_value(const char *name, F f, bool pre_emit_eol);
+
+private:
+ BatchbufferLoggerOutput(const BatchbufferLoggerOutput &obj) = delete;
+
+ BatchbufferLoggerOutput&
+ operator=(const BatchbufferLoggerOutput &rhs) = delete;
+
+ void
+ write_name_value(enum i965_batchbuffer_logger_message_type_t tp,
+ const char *name, const char *fmt,
+ va_list va);
+
+ std::FILE *m_file;
+ std::string m_filename;
+ unsigned int m_current_block_level;
+};
+
+/* An APIStartCallMarker gives the details of an API call
+ * together with "where" in the batchbuffer the API
+ * call started.
+ */
+class APIStartCallMarker {
+public:
+ APIStartCallMarker(int call_id,
+ bool print_element,
+ const char *api_call,
+ const char *api_call_details,
+ uint32_t t):
+ m_call_id(call_id),
+ m_api_call(api_call),
+ m_api_call_details(api_call_details),
+ m_start_bb_location(t),
+ m_print_element(print_element)
+ {}
+
+  /* on emit, changes the value of m_print_element to false
+   * so that the element is not emitted again; returns
+   * true if m_print_element was true at call entry
+ */
+ bool
+ emit(uint32_t next_entry_start_bb_location,
+ BatchbufferLoggerOutput &dst, unsigned int top_level);
+
+ uint32_t
+ start_bb_location(void) const
+ {
+ return m_start_bb_location;
+ }
+
+ int
+ call_id(void) const
+ {
+ return m_call_id;
+ }
+
+ void
+ add_ioctl_log_entry(const std::string &entry)
+ {
+ m_ioctl_log.push_back(entry);
+ }
+
+ static
+ void
+ print_ioctl_log(const std::list<std::string> &ioctl_log,
+ BatchbufferLoggerOutput &dst);
+
+private:
+ /* the ID number for the call */
+ int m_call_id;
+
+ /* name of the API call */
+ std::string m_api_call;
+
+ /* details of the API call */
+ std::string m_api_call_details;
+
+ /* location in the batchbuffer at the time
+ * the marker was made.
+ */
+ uint32_t m_start_bb_location;
+
+ /* true if element is within the print window */
+ bool m_print_element;
+
+ /* additional log-messages that come from ioctl's */
+ std::list<std::string> m_ioctl_log;
+};
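+
+/* Illustrative sketch (not part of this patch): a marker created as
+ *
+ *   APIStartCallMarker marker(call_id, true, "glDrawArrays",
+ *                             "mode = GL_TRIANGLES, first = 0, count = 3",
+ *                             bb_location);
+ *
+ * records where in the batchbuffer the call began; emit() compares that
+ * location against the start of the -next- marker to tag calls that
+ * actually generated GPU commands.
+ */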
+
+class GEMBufferObject {
+public:
+ /* Value passed is the value AFTER the ioctl
+   * DRM_IOCTL_I915_GEM_CREATE; the kernel passes
+   * back the modified struct
+ */
+ explicit
+ GEMBufferObject(int fd, const struct drm_i915_gem_create &pdata);
+
+ /* Value passed is the value AFTER the ioctl
+   * DRM_IOCTL_I915_GEM_USERPTR; the kernel passes
+   * back the modified struct
+ */
+ explicit
+ GEMBufferObject(int fd, const struct drm_i915_gem_userptr &pdata);
+
+ /* To be called -BEFORE- the ioctl DRM_IOCTL_GEM_CLOSE of
+ * the GEM
+ */
+ ~GEMBufferObject();
+
+ /* Handle to the GEM BO */
+ uint32_t
+ handle(void) const
+ {
+ return m_handle;
+ }
+
+ /* size of GEM BO in bytes */
+ uint64_t
+ size(void) const
+ {
+ return m_size;
+ }
+
+ /* If underlying GEM BO was created with DRM_IOCTL_I915_GEM_USERPTR,
+ * then returns the CPU address of the underlying memory
+ */
+ const void*
+ user_ptr(void) const
+ {
+ return m_user_ptr;
+ }
+
+ /* GPU address of GEM BO, note that until
+ * update_gpu_address() is called the value
+ * is 0, which is guaranteed to be incorrect.
+ */
+ uint64_t
+ gpu_address_begin(void) const
+ {
+ return m_gpu_address;
+ }
+
+ /* Gives the GPU address for the very end of the BO */
+ uint64_t
+ gpu_address_end(void) const
+ {
+ return m_size + m_gpu_address;
+ }
+
+ void
+ update_gpu_address(uint64_t new_gpu_address)
+ {
+ m_gpu_address = new_gpu_address;
+ }
+
+ template<typename T = void>
+ const T*
+ cpu_mapped(void) const
+ {
+ return static_cast<const T*>(m_mapped);
+ }
+
+ int
+ pread_buffer(void *dst, uint64_t start, uint64_t sz) const;
+
+private:
+ /* File descriptor of ioctl to make GEM BO */
+ int m_fd;
+
+ uint32_t m_handle;
+ uint64_t m_size;
+ const uint8_t *m_user_ptr;
+
+ /* The buffer mapped; there is a danger that mapping
+ * the buffer without sufficient cache flushing
+ * will give incorrect data; on the other hand,
+ * the gen_decoder interface wants raw pointers
+ * from which to read. Let's hope that cache
+ * flushing is not needed for reading the contents.
+ */
+ void *m_mapped;
+
+ /* the location in the GPU address space of the GEM
+ * object, this is updated by the kernel in the
+ * value drm_i915_gem_exec_object2::offset
+ */
+ uint64_t m_gpu_address;
+};
+
+/* class to extract a value from a gen_field_iterator */
+class GPUCommandFieldValue {
+public:
+ explicit
+ GPUCommandFieldValue(const gen_field_iterator &iter);
+
+ template<typename T>
+ T
+ value(void) const;
+
+ /**
+   * Returns the gen type as indicated by the gen_field_iterator
+   * used at construction; the value is from the unnamed
+   * enumeration in gen_field::type::kind.
+ */
+ unsigned int
+ type(void) const
+ {
+ return m_gen_type;
+ }
+
+private:
+ /* enum values from the unnamed enum in gen_field::type::kind */
+ unsigned int m_gen_type;
+
+ union {
+ /* for types GEN_TYPE_FLOAT, GEN_TYPE_UFIXED and GEN_TYPE_SFIXED */
+ float f;
+
+ /* for type GEN_TYPE_INT */
+ int64_t i;
+
+ /* for types GEN_TYPE_UNKNOWN, GEN_TYPE_UINT,
+ * GEN_TYPE_ADDRESS, GEN_TYPE_OFFSET, GEN_TYPE_ENUM
+ */
+ uint64_t u;
+
+ /* for GEN_TYPE_BOOL
+ */
+ bool b;
+ } m_value;
+
+ /* field extraction routines and helpers taken from
+ * gen_decoder.c
+ */
+ static
+ uint64_t
+ mask(int start, int end)
+ {
+ uint64_t v;
+ v = ~0ULL >> (63 - end + start);
+ return v << start;
+ }
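+
+  /* e.g. mask(4, 7): 63 - 7 + 4 == 60, so ~0ULL >> 60 == 0xF (four
+   * set bits) and 0xF << 4 == 0xF0, i.e. bits 4..7 are set.
+   */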
+
+ static
+ void
+ get_start_end_pos(int *start, int *end)
+ {
+ if (*end - *start > 32) {
+ int len = *end - *start;
+ *start = *start % 32;
+ *end = *start + len;
+ } else {
+ *start = *start % 32;
+ *end = *end % 32;
+ }
+ }
+
+ template<typename T>
+ static
+ T
+ field(uint64_t value, int start, int end)
+ {
+ uint64_t v;
+ get_start_end_pos(&start, &end);
+ v = (value & mask(start, end)) >> (start);
+ return static_cast<T>(v);
+ }
+
+ static
+ uint64_t
+ field_address(uint64_t value, int start, int end)
+ {
+ get_start_end_pos(&start, &end);
+ return (value & mask(start, end));
+ }
+};
+
+
+/* Result of querying which GEMBufferObject, and what
+ * offset into that GEMBufferObject, corresponds to a
+ * GPU address
+ */
+class GPUAddressQuery {
+public:
+ GPUAddressQuery(void):
+ m_gem_bo(nullptr),
+ m_offset_into_gem_bo(-1)
+ {}
+
+ GEMBufferObject *m_gem_bo;
+ uint64_t m_offset_into_gem_bo;
+};
+
+/* A GPUCommand is a location within a GEM BO
+ * specifying where a GPU command is.
+ */
+class GPUCommand {
+public:
+ /* when saving GPUCommand's that set GPU state, we key
+ * the value by the op-code of the GPU command.
+ */
+ typedef uint32_t state_key;
+
+ /* what we do with the GPUCommand on absorbing it:
+ * - save the value as state and do not print it immediately
+ * - print it immediately and show current GPU state
+   * - print it immediately and do not show current GPU state
+ */
+ enum gpu_command_type_t {
+ gpu_command_save_value_as_state_hw_context,
+ gpu_command_save_value_as_state_not_hw_context,
+ gpu_command_set_register,
+ gpu_command_show_value_with_gpu_state,
+ gpu_command_show_value_without_gpu_state,
+ };
+
+  /* only defined for gpu_command_type_t values
+   * gpu_command_save_value_as_state_* and gpu_command_show_value_with_gpu_state
+ */
+ enum gpu_pipeline_type_t {
+ gpu_pipeline_compute,
+ gpu_pipeline_gfx,
+ };
+
+ GPUCommand(void);
+
+  /* if grp is nullptr, then use spec and the contents at
+   * the location to figure out what the GPU command is.
+ */
+ GPUCommand(const GEMBufferObject *q, uint64_t dword_offset,
+ struct gen_spec *spec, struct gen_group *grp = nullptr);
+
+ GPUCommand(const GPUAddressQuery &q, struct gen_spec *spec,
+ struct gen_group *grp = nullptr);
+
+ const uint32_t*
+ contents_ptr(void) const
+ {
+ return m_contents;
+ }
+
+ uint32_t
+ operator[](unsigned int I) const
+ {
+ assert(I < contents_size());
+ return m_contents[I];
+ }
+
+ uint32_t
+ content(unsigned int I) const
+ {
+ assert(I < contents_size());
+ return m_contents[I];
+ }
+
+ unsigned int
+ contents_size(void) const
+ {
+ return m_dword_length;
+ }
+
+ struct gen_group*
+ inst(void) const
+ {
+ return m_inst;
+ }
+
+ const GEMBufferObject*
+ gem_bo(void) const
+ {
+ return m_gem_bo;
+ }
+
+ uint64_t
+ offset(void) const
+ {
+ return m_gem_bo_offset;
+ }
+
+ uint64_t
+ dword_offset(void) const
+ {
+ return offset() / sizeof(uint32_t);
+ }
+
+ enum gpu_command_type_t
+ gpu_command_type(void) const
+ {
+ return m_command_type;
+ }
+
+ enum gpu_pipeline_type_t
+ gpu_pipeline_type(void) const
+ {
+ return m_pipeline_type;
+ }
+
+ /* read a GPU address from a location within the GPUCommand */
+ uint64_t
+ get_gpu_address(const BatchRelocs &relocs,
+ uint64_t dword_offset_from_cmd_start,
+ bool ignore_lower_12_bits = true) const;
+
+ /* Sets up the GPUCommand to read data from an internal storage
+ * instead of from the GEM BO.
+ */
+ void
+ archive_data(const BatchRelocs &relocs);
+
+ /* Returns true if and only if the GPUCommand is reading data
+ * from internal storage instead of from the GEM BO.
+ */
+ bool
+ is_archived(void) const
+ {
+ return m_archived_data.size() == m_dword_length;
+ }
+
+ /* Extract the value of a field from a GPUCommand, saving
+ * the value in dst. Returns true on success and false
+ * on failure.
+ */
+ template<typename T>
+ bool
+ extract_field_value(const char *pname, T *dst) const;
+
+private:
+ static
+ enum gpu_command_type_t
+ get_gpu_command_type(struct gen_group *inst);
+
+ static
+ enum gpu_pipeline_type_t
+ get_gpu_pipeline_type(struct gen_group *inst);
+
+ void
+ complete_init(uint32_t dword_offset, struct gen_spec *spec,
+ struct gen_group *grp);
+
+ const GEMBufferObject *m_gem_bo;
+ uint64_t m_gem_bo_offset;
+ struct gen_group *m_inst;
+ const uint32_t *m_contents;
+ unsigned int m_dword_length;
+ enum gpu_command_type_t m_command_type;
+ enum gpu_pipeline_type_t m_pipeline_type;
+ std::vector<uint32_t> m_archived_data;
+};
+
+/* A significant amount of state on i965 depends deeply on other
+ * portions of state for decoding. The biggest example being
+ * the values in STATE_BASE_ADDRESS.
+ */
+class i965LatchState {
+public:
+ class per_stage_values {
+ public:
+ per_stage_values(void):
+ m_binding_table_count(-1),
+ m_sampler_count(-1)
+ {}
+
+ int m_binding_table_count;
+ int m_sampler_count;
+ };
+
+ i965LatchState(void);
+
+ void
+ update_state(BatchbufferDecoder *decoder, BatchbufferLoggerOutput &pfile,
+ const GPUCommand &q);
+
+ /* Tracking STATE_BASE_ADDRESS */
+ uint64_t m_general_state_base_address;
+ uint64_t m_surface_state_base_address;
+ uint64_t m_dynamic_state_base_address;
+ uint64_t m_instruction_base_address;
+
+ /* value derived from 3D_STATE_XS */
+ int m_VIEWPORT_count;
+ per_stage_values m_VS, m_HS, m_DS, m_GS, m_PS, m_CS;
+
+private:
+ void
+ update_stage_values(BatchbufferDecoder *decoder,
+ BatchbufferLoggerOutput &pfile,
+ const GPUCommand &q, per_stage_values *dst);
+
+ static
+ void
+ update_state_base_address_helper(const GPUCommand &q,
+ const char *value_enabled_name,
+ uint64_t *dst, const char *value_name);
+
+ void
+ update_state_base_address(BatchbufferDecoder *decoder,
+ BatchbufferLoggerOutput &pfile,
+ const GPUCommand &q);
+};
+
+/* A simple container to track the value of registers.
+ */
+class i965Registers {
+public:
+ i965Registers(void)
+ {}
+
+ void
+ update_state(BatchbufferDecoder *decoder, BatchbufferLoggerOutput &pfile,
+ const GPUCommand &q);
+
+ void
+ decode_contents(BatchbufferDecoder *decoder,
+ enum GPUCommand::gpu_pipeline_type_t pipeline,
+ BatchbufferLoggerOutput &pfile);
+
+private:
+  /* register values are part of state; the key
+   * of the map is the register offset and the value
+   * is the value of the register.
+ */
+ std::map<uint32_t, uint32_t> m_register_values;
+};
+
+/* The execbuffer2 ioctls, (DRM_IOCTL_I915_GEM_EXECBUFFER2
+ * and DRM_IOCTL_I915_GEM_EXECBUFFER2_WR) can pass a HW
+ * context (via a uint32_t). When a driver uses a HW context,
+ * it can avoid sending large amounts of state commands to
+ * restore state. However, when we decode a batchbuffer,
+ * we need to record HW state that impacts decoding
+ * batchbuffers. The Bspec page to examine for what is
+ * saved and restored in a HW context is at
+ * gfxspecs.intel.com/Predator/Home/Index/20855
+ */
+class i965HWContextData {
+public:
+ explicit
+ i965HWContextData(uint32_t ctx_id);
+ ~i965HWContextData();
+
+ void
+ decode_contents(BatchbufferDecoder *decoder,
+ enum GPUCommand::gpu_pipeline_type_t pipeline,
+ BatchbufferLoggerOutput &pfile);
+
+ void
+ update_state(BatchbufferDecoder *decoder, BatchbufferLoggerOutput &pfile,
+ const GPUCommand &Q);
+
+ /* Batchbuffer decoding needs to examine and change
+ * the values in i965LatchState when decoding some
+ * elements of state.
+ */
+ i965LatchState m_latch_state;
+
+private:
+ uint32_t m_ctx_id;
+ std::map<GPUCommand::state_key, GPUCommand> m_state;
+ i965Registers m_registers;
+};
+
+class GPUState {
+public:
+ explicit
+ GPUState(i965HWContextData *ctx):
+ m_ctx_data(ctx)
+ {}
+
+ void
+ update_state(BatchbufferDecoder *decoder, BatchbufferLoggerOutput &pfile,
+ const GPUCommand &Q);
+
+ void
+ decode_contents(BatchbufferDecoder *decoder,
+ enum GPUCommand::gpu_pipeline_type_t pipeline,
+ BatchbufferLoggerOutput &pfile);
+
+ i965HWContextData&
+ ctx(void) const
+ {
+ return *m_ctx_data;
+ }
+
+private:
+  /* holder for state of the HW context */
+ i965HWContextData *m_ctx_data;
+
+ /* state that is not saved in the HW context */
+ std::map<GPUCommand::state_key, GPUCommand> m_state;
+ i965Registers m_registers;
+};
+
+/* A BatchRelocs tracks the relocation data reported back
+ * from the kernel after an ioctl
+ */
+class BatchRelocs {
+public:
+ explicit
+ BatchRelocs(gen_spec *spec):
+ m_32bit_gpu_addresses(spec && gen_spec_get_gen(spec) < gen_make_gen(8, 0))
+ {
+ }
+
+ void
+ add_entry(const GEMBufferObject *gem,
+ uint64_t offset_into_gem,
+ uint64_t gpu_address)
+ {
+ m_relocs[gem][offset_into_gem] = gpu_address;
+ }
+
+  /* Write into dst any relocations found for the given GEM,
+   * with gem_bo_offset giving the offset in -bytes- from the
+   * start of the GEM at which dst begins.
+ */
+ void
+ place_relocation_values_into_buffer(const GEMBufferObject *gem, uint64_t gem_bo_offset,
+ std::vector<uint32_t> *dst) const;
+
+ /* Decode/get a GPU address from a location in a GEMBufferObject
+ * - dword_offset in units of uint32_t's
+   * - if ignore_lower_12_bits is true, then the low 12 bits of the
+ * passed gpu-address are ignored and the fetch is as if
+ * they are zero
+ */
+ uint64_t
+ get_gpu_address(const GEMBufferObject *q, uint64_t dword_offset,
+ const uint32_t *p, bool ignore_lower_12_bits = true) const;
+
+
+ void
+ emit_reloc_data(BatchbufferLoggerOutput &pfile);
+
+private:
+ bool m_32bit_gpu_addresses;
+
+ /* m_relocs[p] gives how to potentially reinterpret GPU addresses
+   * when reading from buffer object p. That map is keyed
+   * by offsets into p, with values giving the correct
+   * address at each offset.
+ */
+ typedef std::map<uint64_t, uint64_t> reloc_map_of_gem_bo;
+ typedef std::map<const GEMBufferObject*, reloc_map_of_gem_bo> reloc_map;
+ reloc_map m_relocs;
+};
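+
+/* Illustrative sketch (not part of this patch): after an execbuffer2
+ * ioctl returns, each relocation the kernel resolved can be recorded as
+ *
+ *   relocs.add_entry(gem, reloc_offset_in_bytes, resolved_gpu_address);
+ *
+ * so that a later get_gpu_address() on that dword returns the address
+ * the kernel actually assigned rather than a stale value read from
+ * memory.
+ */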
+
+/* A ShaderFileList acts as a map from shaders to filenames.
+ * A hash value is used as the key of the map. If the contents
+ * of a shader are not found, then a new entry is made.
+ */
+class ShaderFileList
+{
+public:
+ ShaderFileList(void):
+ m_count(0)
+ {}
+
+ const char*
+ filename(const std::string &fileprefix, const void *shader,
+ int pciid, struct gen_disasm *gen_disasm);
+
+ void
+ clear(void)
+ {
+ m_count = 0;
+ m_files.clear();
+ }
+
+private:
+ typedef std::array<unsigned char, 20> sha1_value;
+
+ ShaderFileList(const ShaderFileList&) = delete;
+
+ ShaderFileList&
+ operator=(const ShaderFileList &rhs) = delete;
+
+ int m_count;
+ std::map<sha1_value, std::string> m_files;
+};
+
+/* A BatchbufferDecoder assists in decoding the contents
+ * of a batchbuffer, using the machinery in a GEMBufferTracker
+ * to correctly read the contents of indirect state.
+ */
+class BatchbufferDecoder {
+public:
+ enum decode_level_t {
+ no_decode,
+ instruction_decode,
+ instruction_details_decode
+ };
+
+ enum print_reloc_level_t {
+ print_reloc_nothing,
+ print_reloc_gem_gpu_updates,
+ };
+
+  /* enumeration giving which bit of the shader decode flags controls each stage */
+ enum shader_decode_entry_t {
+ shader_decode_vs,
+ shader_decode_hs,
+ shader_decode_ds,
+ shader_decode_gs,
+ shader_decode_ps_8,
+ shader_decode_ps_16,
+ shader_decode_ps_32,
+ shader_decode_media_compute,
+
+ shader_decode_entry_count,
+ };
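+
+  /* e.g. (sketch): a decode-flags value requesting disassembly of only
+   * the vertex shader and the 16-wide pixel shader would be
+   *   (1u << shader_decode_vs) | (1u << shader_decode_ps_16)
+   */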
+
+ BatchbufferDecoder(enum decode_level_t decode_level,
+ enum print_reloc_level_t print_reloc_level,
+ uint32_t shader_decode_flags,
+ struct gen_spec *spec,
+ struct gen_disasm *dis,
+ int pciid,
+ GEMBufferTracker *tracker,
+ ShaderFileList *shader_filelist,
+ struct drm_i915_gem_execbuffer2 *execbuffer2);
+
+ void
+ absorb_batchbuffer_contents(bool printing_enabled,
+ BatchbufferLoggerOutput &pfile,
+ unsigned int start_dword, unsigned int end_dword);
+
+ void
+ decode_gpu_command(BatchbufferLoggerOutput &pfile, const GPUCommand &q);
+
+ const GEMBufferTracker&
+ tracker(void) const
+ {
+ return *m_tracker;
+ }
+
+ const GEMBufferObject*
+ batchbuffer(void)
+ {
+ return m_batchbuffer;
+ }
+
+ const BatchRelocs&
+ relocs(void) const
+ {
+ return m_relocs;
+ }
+
+ BatchbufferLog*
+ batchbuffer_log(void)
+ {
+ return m_batchbuffer_log;
+ }
+
+ struct gen_spec*
+ spec(void) const
+ {
+ return m_spec;
+ }
+
+ void
+ emit_log(BatchbufferLoggerOutput &file);
+
+private:
+ class DetailedDecoder
+ {
+ public:
+ static
+ void
+ decode(BatchbufferDecoder *decoder, BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ private:
+ typedef void (BatchbufferDecoder::*fcn)(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ DetailedDecoder(void);
+
+ DetailedDecoder(const DetailedDecoder &obj) = delete;
+
+ DetailedDecoder&
+ operator=(const DetailedDecoder &rhs) = delete;
+
+ /* keyed by op-code */
+ std::map<uint32_t, fcn> m_elements;
+ };
+
+ void
+ build_driver_values(void);
+
+ void
+ decode_gen_group(BatchbufferLoggerOutput &pfile,
+ const GEMBufferObject *q, uint64_t offset,
+ const uint32_t *p, struct gen_group *inst);
+
+ void
+ decode_gpu_execute_command(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &q);
+
+ void
+ process_gpu_command(bool printing_enabled,
+ BatchbufferLoggerOutput &pfile,
+ const GPUCommand &q);
+
+ void
+ decode_pointer_helper(BatchbufferLoggerOutput &pfile,
+ struct gen_group *g, uint64_t gpu_address);
+
+ void
+ decode_pointer_helper(BatchbufferLoggerOutput &pfile,
+ const char *instruction_name,
+ uint64_t gpu_address);
+
+ void
+ decode_shader(BatchbufferLoggerOutput &pfile,
+ enum shader_decode_entry_t tp, uint64_t gpu_address);
+
+ void
+ decode_3dstate_binding_table_pointers(BatchbufferLoggerOutput &pfile,
+ const std::string &label, uint32_t offset,
+ int cnt);
+
+ void
+ decode_3dstate_sampler_state_pointers_helper(BatchbufferLoggerOutput &pfile,
+ uint32_t offset, int cnt);
+
+ void
+ decode_media_interface_descriptor_load(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_xs(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_ps(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_constant(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_binding_table_pointers_vs(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_binding_table_pointers_ds(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_binding_table_pointers_hs(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_binding_table_pointers_gs(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_binding_table_pointers_ps(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_sampler_state_pointers_vs(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_sampler_state_pointers_gs(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_sampler_state_pointers_hs(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_sampler_state_pointers_ds(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_sampler_state_pointers_ps(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_sampler_state_pointers_gen6(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_viewport_state_pointers_cc(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_viewport_state_pointers_sf_clip(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_blend_state_pointers(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_cc_state_pointers(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ void
+ decode_3dstate_scissor_state_pointers(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data);
+
+ enum decode_level_t m_decode_level;
+ enum print_reloc_level_t m_print_reloc_level;
+ uint32_t m_shader_decode_flags;
+ struct gen_spec *m_spec;
+ struct gen_disasm *m_gen_disasm;
+ int m_pci_id;
+ GEMBufferTracker *m_tracker;
+ ShaderFileList *m_shader_filelist;
+ const GEMBufferObject *m_batchbuffer;
+ BatchbufferLog *m_batchbuffer_log;
+ std::vector<GEMBufferObject*> m_buffers;
+ bool m_reloc_handles_are_indices;
+ GPUState m_gpu_state;
+ BatchRelocs m_relocs;
+ struct drm_i915_gem_execbuffer2 *m_execbuffer2;
+};
+
+/* The type to hold the log associated to a single batchbuffer
+ */
+class BatchbufferLog {
+public:
+ BatchbufferLog(int fd, const void *driver_data, uint32_t h)
+ {
+ m_src.gem_bo = h;
+ m_src.fd = fd;
+ m_src.driver_data = driver_data;
+ }
+
+ const struct i965_logged_batchbuffer*
+ src(void) const
+ {
+ return &m_src;
+ }
+
+  /* returns true if the last entry in the list was printed to the file */
+  bool
+ emit_log(BatchbufferDecoder *decoder, BatchbufferLoggerOutput &file,
+ uint32_t batchbuffer_len);
+
+ void
+ add_call_marker(bool should_print,
+ BatchbufferLog &dummy, unsigned int call_id,
+ const char *fcn_name, const char *call_detailed,
+ uint32_t bb_location)
+ {
+ if (this != &dummy) {
+ m_prints_from_dummy.splice(m_prints_from_dummy.end(),
+ dummy.m_prints);
+ }
+ APIStartCallMarker ap(call_id, should_print, fcn_name, call_detailed,
+ bb_location);
+ m_prints.push_back(ap);
+ }
+
+ void
+ clear(void)
+ {
+ m_prints.clear();
+ m_prints_from_dummy.clear();
+ }
+
+ bool
+ empty(void) const
+ {
+ return m_prints.empty() && m_prints_from_dummy.empty();
+ }
+
+ void
+ add_ioctl_log_entry(const std::string &entry);
+
+private:
+ void
+ handle_batchbuffer_contents(bool print_enabled,
+ BatchbufferDecoder *decoder, BatchbufferLoggerOutput &dst,
+ uint32_t start, uint32_t end);
+
+ friend class GEMBufferTracker;
+
+ /* src parameters of the BatchbufferLog object */
+ struct i965_logged_batchbuffer m_src;
+
+ /* API markers of the batchbuffer */
+ std::list<APIStartCallMarker> m_prints;
+
+  /* Markers emitted when there is no active batchbuffer
+   * land in BatchbufferLogger::m_dummy. The first time
+   * BatchbufferLogger has a valid batchbuffer, the markers
+   * of m_dummy are spliced onto that batchbuffer's log here.
+ */
+ std::list<APIStartCallMarker> m_prints_from_dummy;
+
+  /* ioctl log entries added when there is no
+   * APIStartCallMarker to which to attach them.
+ */
+ std::list<std::string> m_orphan_ioctl_log_entries;
+};
+
+class GEMBufferTracker {
+public:
+ explicit
+ GEMBufferTracker(int fd);
+
+ ~GEMBufferTracker();
+
+ /* Add a GEM BO, to be called after the ioctl
+ * DRM_IOCTL_I915_GEM_CREATE returns with the
+ * kernel modified drm_i915_gem_create value
+ */
+ void
+ add_gem_bo(const struct drm_i915_gem_create &pdata);
+
+ /* Add a GEM BO, to be called after the ioctl
+ * DRM_IOCTL_I915_GEM_USERPTR returns with the
+ * kernel modified drm_i915_gem_userptr value
+ */
+ void
+ add_gem_bo(const struct drm_i915_gem_userptr &pdata);
+
+ /* remove a GEM BO from tracking */
+ void
+ remove_gem_bo(uint32_t h);
+
+ /* Fetch a GEMBufferObject given a GEM handle */
+ GEMBufferObject*
+ fetch_gem_bo(uint32_t h) const;
+
+ /* Add a new HW GEM context for tracking */
+ void
+ add_hw_context(const struct drm_i915_gem_context_create &create);
+
+ /* remove a HW GEM context for tracking */
+ void
+ remove_hw_context(const struct drm_i915_gem_context_destroy &destroy);
+
+ /* fetch a GEM HW context from a handle */
+ i965HWContextData*
+ fetch_hw_context(uint32_t h);
+
+ /* to be called just after the ioctl
+ * DRM_IOCTL_I915_GEM_EXECBUFFER2 or
+ * DRM_IOCTL_I915_GEM_EXECBUFFER2_WR
+   * is issued, passing the GEM BO list
+   * modified by the kernel; returns which
+   * GEMBufferObject had the GEM handle
+   * and whether the GPU address changed
+ */
+ std::pair<bool, GEMBufferObject*>
+ update_gem_bo_gpu_address(const struct drm_i915_gem_exec_object2 *p);
+
+  /* Return which GEM BO, and the offset into
+   * that GEM BO, corresponds to a given GPU address.
+ */
+ GPUAddressQuery
+ get_memory_at_gpu_address(uint64_t) const;
+
+ /* Use kernel interface pread to read contents */
+ int
+ pread_buffer(void *dst, uint64_t gpu_address, uint64_t size) const;
+
+  /* Get the CPU mapping of a GEM BO from a GPU address */
+ template<typename T>
+ const T*
+ cpu_mapped(uint64_t gpu_address, GPUAddressQuery *q = nullptr);
+
+ /* Fetch (or create) a BatchbufferLog given a
+ * GEM handle and an opaque pointer provided by the
+ * driver for a batchbuffer.
+ */
+ BatchbufferLog*
+ fetch_or_create(const void *opaque_bb, uint32_t gem_handle);
+
+ /* Fetch a BatchbufferLog given a GEM handle, if
+ * no BatchbufferLog exists, then return nullptr
+ */
+ BatchbufferLog*
+ fetch(uint32_t gem_handle);
+
+ /* remove a BatchbufferLog from tracking */
+ void
+ remove_batchbuffer_log(const BatchbufferLog *q);
+
+  /* Emit to file any BatchbufferLog objects that remain,
+   * and also remove them.
+ */
+ void
+ emit_unemitted_log(BatchbufferLoggerOutput &dst);
+
+private:
+ int m_fd;
+
+ /* GEM BO's keyed by DRM handle */
+ std::map<uint32_t, GEMBufferObject*> m_gem_bos_by_handle;
+
+ /* GEM BO's keyed by the GPU address of the end of the GEM BO*/
+ std::map<uint64_t, GEMBufferObject*> m_gem_bos_by_gpu_address_end;
+
+ /* HW contexts keyed by DRM handle */
+ std::map<uint32_t, i965HWContextData> m_hw_contexts;
+
+ /* dummy HW context for execbuffer calls without hw
+   * context; the value is reset each time it is fetched
+ */
+ i965HWContextData m_dummy_hw_ctx;
+
+ /* backing storage for the logs, keyed by
+ * batchbuffer DRM handle
+ */
+ std::map<uint32_t, BatchbufferLog> m_logs;
+};
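+
+/* Illustrative sketch (not part of this patch): resolving a GPU address
+ * seen in a batchbuffer back to CPU-visible memory via the tracker:
+ *
+ *   GPUAddressQuery q = tracker.get_memory_at_gpu_address(addr);
+ *   if (q.m_gem_bo) {
+ *     const uint32_t *p = q.m_gem_bo->cpu_mapped<uint32_t>()
+ *                         + q.m_offset_into_gem_bo / sizeof(uint32_t);
+ *   }
+ */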
+
+class BatchbufferLogger:
+ public i965_batchbuffer_logger,
+ public i965_batchbuffer_logger_app {
+public:
+ static
+ BatchbufferLogger*
+ acquire(void);
+
+ static
+ void
+ release(void);
+
+ static
+ int
+ local_drm_ioctl(int fd, unsigned long request, void *argp);
+
+ void
+ set_driver_funcs(int pci_id,
+ i965_logged_batchbuffer_state f1,
+ i965_active_batchbuffer f2);
+
+ void
+ pre_process_ioctl(int fd, unsigned long request, void *argp);
+
+ void
+ post_process_ioctl(int ioctl_return_code, int fd, unsigned long request, void *argp);
+
+private:
+ BatchbufferLogger(void);
+ ~BatchbufferLogger();
+
+ GEMBufferTracker*
+ gem_buffer_tracker(int fd);
+
+ /* Returns nullptr if fd is -1 or if the
+ * GEMBufferTracker associated to the fd
+ * does not have a BatchbufferLog of
+ * the given gem_bo
+ */
+ BatchbufferLog*
+ fetch_batchbuffer_log(int fd, uint32_t gem_bo)
+ {
+ /* We do NOT want to create a BatchbufferLog
+ * object, thus we use the call that only fetches
+ * and does not create.
+ */
+ return (fd != -1) ?
+ gem_buffer_tracker(fd)->fetch(gem_bo) :
+ nullptr;
+ }
+
+ /* if fd is -1, then returns the dummy BatchbufferLog,
+   * otherwise fetches or creates a BatchbufferLog from
+ * the fields of the passed batchbuffer
+ */
+ BatchbufferLog*
+ fetch_or_create_batchbuffer_log(const struct i965_logged_batchbuffer *batchbuffer)
+ {
+ int fd;
+ fd = (batchbuffer != nullptr) ? batchbuffer->fd : -1;
+ return (fd != -1) ?
+ gem_buffer_tracker(fd)->fetch_or_create(batchbuffer->driver_data, batchbuffer->gem_bo) :
+ &m_dummy;
+ }
+
+ /* Calls m_active_batchbuffer to get the value of
+ * the active batchbuffer and uses that.
+ */
+ BatchbufferLog*
+ fetch_or_create_batchbuffer_log(void)
+ {
+ struct i965_logged_batchbuffer bb;
+ m_active_batchbuffer(&bb);
+ return fetch_or_create_batchbuffer_log(&bb);
+ }
+
+ static
+ void
+ aborted_batchbuffer_fcn(struct i965_batchbuffer_logger*, int fd, uint32_t gem_bo);
+
+ static
+ void
+ release_driver_fcn(struct i965_batchbuffer_logger *pthis);
+
+ static
+ void
+ pre_call_fcn(struct i965_batchbuffer_logger_app *pthis,
+ unsigned int call_id,
+ const char *call_detailed,
+ const char *fcn_name);
+
+ static
+ void
+ post_call_fcn(struct i965_batchbuffer_logger_app *pthis,
+ unsigned int call_id);
+
+ static
+ void
+ end_logging_fcn(struct i965_batchbuffer_logger_app *pthis);
+
+ static
+ void
+ begin_logging_fcn(struct i965_batchbuffer_logger_app *pthis,
+ const char *name);
+
+ static
+ void
+ release_app_fcn(struct i965_batchbuffer_logger_app *pthis);
+
+ static
+ uint32_t
+ default_batchbuffer_state_fcn(const struct i965_logged_batchbuffer *st)
+ {
+ return 0;
+ }
+
+ static
+ void
+ default_active_batchbuffer_fcn(struct i965_logged_batchbuffer *st)
+ {
+ st->fd = -1;
+ st->gem_bo = ~0u;
+ st->driver_data = nullptr;
+ }
+
+  /* derived from environment variables */
+ unsigned long m_start_log_call_number, m_end_log_call_number;
+ long m_max_file_size;
+ enum BatchbufferDecoder::decode_level_t m_decode_level;
+ enum BatchbufferDecoder::print_reloc_level_t m_print_reloc_level;
+ uint32_t m_shader_decode_flags;
+
+ /* from driver */
+ i965_logged_batchbuffer_state m_batchbuffer_state;
+ i965_active_batchbuffer m_active_batchbuffer;
+ int m_pci_id;
+
+ /* derived data from m_pci_id */
+ struct gen_device_info m_dev_info;
+ struct gen_spec *m_gen_spec;
+ struct gen_disasm *m_gen_disasm;
+
+ /* GEM buffer tracking, keyed by file descriptor */
+ std::map<int, GEMBufferTracker*> m_gem_buffer_trackers;
+
+ ShaderFileList m_shader_filelist;
+
+  /* mutex guaranteeing thread safety */
+ std::mutex m_mutex;
+
+ /* special dummy batchbuffer; markers are added
+   * to it if there is no active batchbuffer; the
+   * first time we get an active batchbuffer, the
+   * markers on the dummy are given to the
+   * BatchbufferLog associated with it.
+ */
+ BatchbufferLog m_dummy;
+ int m_number_aborted_batchbuffers;
+
+ /* output file. */
+ BatchbufferLoggerOutput m_file;
+};
+
+} //namespace
+
+/////////////////////////////////
+//BatchbufferLoggerOutput methods
+BatchbufferLoggerOutput::
+~BatchbufferLoggerOutput()
+{
+ close();
+}
+
+void
+BatchbufferLoggerOutput::
+open(const char *filename)
+{
+ close();
+ m_file = std::fopen(filename, "w");
+ if (m_file) {
+ m_filename = filename;
+ }
+}
+
+void
+BatchbufferLoggerOutput::
+close(void)
+{
+ if (m_file) {
+ clear_block_stack();
+ std::fclose(m_file);
+ m_file = nullptr;
+ m_filename.clear();
+ }
+}
+
+void
+BatchbufferLoggerOutput::
+begin_block(const char *txt)
+{
+ struct i965_batchbuffer_logger_header hdr;
+
+ ++m_current_block_level;
+ hdr.type = I965_BATCHBUFFER_LOGGER_MESSAGE_BLOCK_BEGIN;
+ hdr.name_length = std::strlen(txt);
+ hdr.value_length = 0;
+
+ std::fwrite(&hdr, sizeof(hdr), 1, m_file);
+ std::fwrite(txt, sizeof(char), hdr.name_length, m_file);
+}
+
+void
+BatchbufferLoggerOutput::
+begin_block_value(const char *txt, const char *fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+ vbegin_block_value(txt, fmt, args);
+ va_end(args);
+}
+
+void
+BatchbufferLoggerOutput::
+vbegin_block_value(const char *txt, const char *fmt, va_list va)
+{
+ ++m_current_block_level;
+ write_name_value(I965_BATCHBUFFER_LOGGER_MESSAGE_BLOCK_BEGIN, txt, fmt,
+ va);
+}
+
+void
+BatchbufferLoggerOutput::
+end_block(void)
+{
+ if (m_current_block_level > 0) {
+ struct i965_batchbuffer_logger_header hdr;
+
+ hdr.type = I965_BATCHBUFFER_LOGGER_MESSAGE_BLOCK_END;
+ hdr.name_length = 0;
+ hdr.value_length = 0;
+ std::fwrite(&hdr, sizeof(hdr), 1, m_file);
+ --m_current_block_level;
+ }
+}
+
+void
+BatchbufferLoggerOutput::
+clear_block_stack(unsigned int desired_depth)
+{
+ while(m_current_block_level > desired_depth) {
+ end_block();
+ }
+}
+
+void
+BatchbufferLoggerOutput::
+print_value(const char *name, const char *fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+ vprint_value(name, fmt, args);
+ va_end(args);
+}
+
+void
+BatchbufferLoggerOutput::
+vprint_value(const char *name, const char *fmt, va_list va)
+{
+ write_name_value(I965_BATCHBUFFER_LOGGER_MESSAGE_VALUE, name, fmt, va);
+}
+
+void
+BatchbufferLoggerOutput::
+write_name_value(enum i965_batchbuffer_logger_message_type_t tp,
+ const char *name, const char *fmt,
+ va_list va)
+{
+ char buffer[4096];
+ struct i965_batchbuffer_logger_header hdr;
+ va_list va_value;
+
+ va_copy(va_value, va);
+ hdr.type = tp;
+ hdr.name_length = std::strlen(name);
+ hdr.value_length = std::vsnprintf(buffer, sizeof(buffer), fmt, va);
+
+ std::fwrite(&hdr, sizeof(hdr), 1, m_file);
+ std::fwrite(name, sizeof(char), hdr.name_length, m_file);
+  if (hdr.value_length >= sizeof(buffer)) {
+    std::vector<char> tmp(hdr.value_length + 1);
+    std::vsnprintf(&tmp[0], tmp.size(), fmt, va_value);
+    std::fwrite(&tmp[0], sizeof(char), hdr.value_length, m_file);
+  } else {
+    std::fwrite(buffer, sizeof(char), hdr.value_length, m_file);
+  }
+  va_end(va_value);
+}
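+
+/* Illustrative sketch (not part of this patch) of how a consumer can
+ * walk the emitted stream; each record is a header followed by the
+ * name bytes and then the value bytes:
+ *
+ *   struct i965_batchbuffer_logger_header hdr;
+ *   while (std::fread(&hdr, sizeof(hdr), 1, file) == 1) {
+ *     std::vector<char> name(hdr.name_length), value(hdr.value_length);
+ *     std::fread(name.data(), 1, name.size(), file);
+ *     std::fread(value.data(), 1, value.size(), file);
+ *     // hdr.type distinguishes block-begin/block-end/value records
+ *   }
+ */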
+
+template<typename F>
+void
+BatchbufferLoggerOutput::
+functor_print_value(const char *name, F f, bool pre_emit_eol)
+{
+ struct i965_batchbuffer_logger_header hdr;
+ long header_start, value_start, value_end;
+
+ hdr.type = I965_BATCHBUFFER_LOGGER_MESSAGE_VALUE;
+ hdr.name_length = std::strlen(name);
+ hdr.value_length = 0;
+
+ header_start = std::ftell(m_file);
+ std::fseek(m_file, sizeof(hdr), SEEK_CUR);
+ std::fwrite(name, sizeof(char), hdr.name_length, m_file);
+ value_start = std::ftell(m_file);
+
+ if (pre_emit_eol) {
+ fprintf(m_file, "\n");
+ }
+ f(m_file);
+
+ value_end = std::ftell(m_file);
+ hdr.value_length = value_end - value_start;
+ std::fseek(m_file, header_start, SEEK_SET);
+ std::fwrite(&hdr, sizeof(hdr), 1, m_file);
+ std::fseek(m_file, value_end, SEEK_SET);
+}
+
+////////////////////////////////////
+// APIStartCallMarker methods
+void
+APIStartCallMarker::
+print_ioctl_log(const std::list<std::string> &ioctl_log,
+ BatchbufferLoggerOutput &dst)
+{
+ if (dst && !ioctl_log.empty()) {
+ uint32_t ioctl_message_id;
+ std::list<std::string>::const_iterator iter;
+ for(ioctl_message_id = 0, iter = ioctl_log.begin();
+ iter != ioctl_log.end(); ++iter, ++ioctl_message_id) {
+ std::ostringstream name;
+ name << "IOCTL." << ioctl_message_id;
+ dst.print_value(name.str().c_str(), "%s", iter->c_str());
+ }
+ }
+}
+
+bool
+APIStartCallMarker::
+emit(uint32_t next_entry_start_bb_location,
+ BatchbufferLoggerOutput &dst, unsigned int top_level)
+{
+ bool return_value(m_print_element);
+
+ if (m_print_element) {
+ std::ostringstream str;
+
+ str << "Call." << m_call_id << "." << m_api_call;
+ if (next_entry_start_bb_location > m_start_bb_location) {
+ str << ".CreatedGPUCommands";
+ }
+ dst.clear_block_stack(top_level);
+ dst.begin_block(str.str().c_str());
+ dst.print_value("Call Number", "%d", m_call_id);
+ dst.print_value("Function", "%s", m_api_call.c_str());
+ dst.print_value("Details", "%s", m_api_call_details.c_str());
+ print_ioctl_log(m_ioctl_log, dst);
+ m_print_element = false;
+ }
+
+ return return_value;
+}
+
+//////////////////////////////////
+// GEMBufferObject methods
+GEMBufferObject::
+GEMBufferObject(int fd, const struct drm_i915_gem_create &pdata):
+ m_fd(fd),
+ m_handle(pdata.handle),
+ m_size(pdata.size),
+ m_user_ptr(nullptr),
+ m_gpu_address(0)
+{
+ struct drm_i915_gem_mmap map;
+ int ret;
+
+ std::memset(&map, 0, sizeof(map));
+ map.handle = m_handle;
+ map.offset = 0;
+ map.size = m_size;
+
+ ret = BatchbufferLogger::local_drm_ioctl(m_fd, DRM_IOCTL_I915_GEM_MMAP, &map);
+ if (ret != -1) {
+ m_mapped = (void*) map.addr_ptr;
+ } else {
+ m_mapped = nullptr;
+ }
+}
+
+GEMBufferObject::
+GEMBufferObject(int fd, const struct drm_i915_gem_userptr &pdata):
+ m_fd(fd),
+ m_handle(pdata.handle),
+ m_size(pdata.user_size),
+ m_user_ptr((const uint8_t*)pdata.user_ptr),
+ m_mapped((void*)pdata.user_ptr),
+ m_gpu_address(0)
+{
+}
+
+GEMBufferObject::
+~GEMBufferObject()
+{
+ if (m_mapped && m_mapped != m_user_ptr) {
+ munmap(m_mapped, m_size);
+ }
+}
+
+int
+GEMBufferObject::
+pread_buffer(void *dst, uint64_t start, uint64_t sz) const
+{
+ if (start + sz > m_size) {
+ return -1;
+ }
+
+ if (!m_user_ptr) {
+    struct drm_i915_gem_pread pread_args;
+    std::memset(&pread_args, 0, sizeof(pread_args));
+ pread_args.handle = m_handle;
+ pread_args.offset = start;
+ pread_args.size = sz;
+ pread_args.data_ptr = (__u64) dst;
+ return BatchbufferLogger::local_drm_ioctl(m_fd, DRM_IOCTL_I915_GEM_PREAD, &pread_args);
+ } else {
+ std::memcpy(dst, m_user_ptr + start, sz);
+ return 0;
+ }
+}
+
+///////////////////////////////
+// GPUCommandFieldValue methods
+GPUCommandFieldValue::
+GPUCommandFieldValue(const gen_field_iterator &iter):
+ m_gen_type(iter.field->type.kind)
+{
+  /* this code is essentially taken from gen_decoder.c's function
+   * gen_field_iterator_next(), but rather than printing the value
+   * to a string (iter.value), we extract the value to this
+   * object's fields.
+ */
+ union {
+ uint64_t qw;
+ float f;
+ } v;
+
+ if ((iter.field->end - iter.field->start) > 32) {
+ v.qw = ((uint64_t) iter.p[iter.dword + 1] << 32) | iter.p[iter.dword];
+  } else {
+ v.qw = iter.p[iter.dword];
+ }
+
+ switch (iter.field->type.kind) {
+ case gen_type::GEN_TYPE_INT:
+ m_value.i = field<int64_t>(v.qw, iter.field->start, iter.field->end);
+ break;
+ default:
+ case gen_type::GEN_TYPE_UINT:
+ case gen_type::GEN_TYPE_ENUM:
+ case gen_type::GEN_TYPE_UNKNOWN:
+ m_value.u = field<uint64_t>(v.qw, iter.field->start, iter.field->end);
+ break;
+ case gen_type::GEN_TYPE_BOOL:
+ m_value.b = field<bool>(v.qw, iter.field->start, iter.field->end);
+ break;
+ case gen_type::GEN_TYPE_FLOAT:
+ m_value.f = v.f;
+ break;
+ case gen_type::GEN_TYPE_ADDRESS:
+ case gen_type::GEN_TYPE_OFFSET:
+ m_value.u = field_address(v.qw, iter.field->start, iter.field->end);
+ break;
+ case gen_type::GEN_TYPE_UFIXED:
+ m_value.f = field<float>(v.qw, iter.field->start, iter.field->end) / float(1 << iter.field->type.f);
+ break;
+ case gen_type::GEN_TYPE_SFIXED: {
+ uint64_t uv;
+ bool is_negative;
+ uint64_t leading_bit;
+ uv = field<uint64_t>(v.qw, iter.field->start, iter.field->end);
+ leading_bit = iter.field->end - iter.field->start - 1;
+ is_negative = uv & (uint64_t(1) << leading_bit);
+ m_value.f = static_cast<float>(uv) / float(1 << iter.field->type.f);
+ if (is_negative) {
+ m_value.f = -m_value.f;
+ }
+ break;
+ }
+ }
+}
+
+template<typename T>
+T
+GPUCommandFieldValue::
+value(void) const
+{
+ switch(m_gen_type) {
+ case gen_type::GEN_TYPE_INT:
+ return static_cast<T>(m_value.i);
+
+ case gen_type::GEN_TYPE_BOOL:
+ return static_cast<T>(m_value.b);
+
+ case gen_type::GEN_TYPE_FLOAT:
+ case gen_type::GEN_TYPE_UFIXED:
+ case gen_type::GEN_TYPE_SFIXED:
+ return static_cast<T>(m_value.f);
+
+ case gen_type::GEN_TYPE_UINT:
+ case gen_type::GEN_TYPE_ENUM:
+ case gen_type::GEN_TYPE_UNKNOWN:
+ case gen_type::GEN_TYPE_ADDRESS:
+ case gen_type::GEN_TYPE_OFFSET:
+ default:
+ return static_cast<T>(m_value.u);
+ }
+}
+
+/////////////////////////////
+// GPUCommand methods
+GPUCommand::
+GPUCommand(void):
+ m_gem_bo(nullptr),
+ m_gem_bo_offset(-1),
+ m_inst(nullptr),
+ m_contents(nullptr),
+ m_dword_length(0),
+  m_command_type(gpu_command_show_value_without_gpu_state),
+  m_pipeline_type(gpu_pipeline_gfx)
+{}
+
+GPUCommand::
+GPUCommand(const GEMBufferObject *q, uint64_t dword_offset, struct gen_spec *spec, struct gen_group *grp):
+ m_gem_bo(q),
+ m_gem_bo_offset(dword_offset * sizeof(uint32_t)),
+ m_dword_length(0),
+ m_command_type(gpu_command_show_value_without_gpu_state),
+ m_pipeline_type(gpu_pipeline_gfx)
+{
+ complete_init(dword_offset, spec, grp);
+}
+
+GPUCommand::
+GPUCommand(const GPUAddressQuery &q, struct gen_spec *spec, struct gen_group *grp):
+ m_gem_bo(q.m_gem_bo),
+ m_gem_bo_offset(q.m_offset_into_gem_bo),
+ m_dword_length(0),
+ m_command_type(gpu_command_show_value_without_gpu_state),
+ m_pipeline_type(gpu_pipeline_gfx)
+{
+ complete_init(m_gem_bo_offset / sizeof(uint32_t), spec, grp);
+}
+
+void
+GPUCommand::
+complete_init(uint32_t dword_offset, struct gen_spec *spec, struct gen_group *grp)
+{
+ int length;
+
+ assert(sizeof(uint32_t) * dword_offset == m_gem_bo_offset);
+
+ m_contents = m_gem_bo->cpu_mapped<uint32_t>() + dword_offset;
+ if(spec && !grp) {
+ m_inst = gen_spec_find_instruction(spec, m_contents);
+ } else {
+ m_inst = grp;
+ }
+
+ if (m_inst) {
+ length = gen_group_get_length(m_inst, m_contents);
+ m_command_type = get_gpu_command_type(m_inst);
+ m_pipeline_type = get_gpu_pipeline_type(m_inst);
+
+ if (length > 0) {
+ m_dword_length = length;
+ }
+ }
+}
+
+template<typename T>
+bool
+GPUCommand::
+extract_field_value(const char *pname, T *dst) const
+{
+ struct gen_field_iterator iter;
+
+ gen_field_iterator_init(&iter, inst(), contents_ptr(), false);
+ while (gen_field_iterator_next(&iter)) {
+ if (!is_header_field(inst(), iter.field) &&
+ 0 == strcmp(pname, iter.name)) {
+ GPUCommandFieldValue value(iter);
+
+ assert(!m_archived_data.empty() ||
+ value.type() == gen_type::GEN_TYPE_ADDRESS);
+ *dst = value.value<T>();
+ return true;
+ }
+ }
+
+ return false;
+}
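+
+/* e.g. (sketch), mirroring the use in i965LatchState::update_state():
+ *
+ *   int v;
+ *   if (cmd.extract_field_value<int>("Maximum VP Index", &v)) {
+ *     viewport_count = v + 1; // hypothetical consumer of the field
+ *   }
+ */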
+
+enum GPUCommand::gpu_command_type_t
+GPUCommand::
+get_gpu_command_type(struct gen_group *inst)
+{
+ uint32_t op_code;
+ op_code = gen_group_get_opcode(inst);
+ switch (op_code) {
+ case _MI_LOAD_REGISTER_MEM: //load a register value from a GEM BO
+ case _MI_LOAD_REGISTER_IMM: //load a register value from batchbuffer
+ case _MI_LOAD_REGISTER_REG: //load a register value from another register
+ return gpu_command_set_register;
+
+ case STATE_BASE_ADDRESS:
+ /* because STATE_BASE_ADDRESS has option to set or not set values,
+ * it is not pure state and thus should be printed on encounter
+ */
+ case _3DSTATE_VF_INSTANCING:
+    /* _3DSTATE_VF_INSTANCING sets whether a named vertex attribute is
+ * instanced
+ */
+ case _MI_NOOP:
+ case _MI_BATCH_BUFFER_START:
+ case _MI_BATCH_BUFFER_END:
+ case _MI_STORE_REGISTER_MEM: //writes a register value to a GEM BO
+ case _MI_PREDICATE: //modify predicate value
+ case _MI_ARB_CHECK:
+ case _MI_ATOMIC:
+ case _MI_CLFLUSH:
+ case _MI_CONDITIONAL_BATCH_BUFFER_END:
+ case _MI_COPY_MEM_MEM:
+ case _MI_DISPLAY_FLIP:
+ case _MI_FORCE_WAKEUP:
+ case _MI_LOAD_SCAN_LINES_EXCL:
+ case _MI_LOAD_SCAN_LINES_INCL:
+ case _MI_MATH:
+ case _MI_REPORT_HEAD:
+ case _MI_REPORT_PERF_COUNT:
+ case _MI_RS_CONTEXT:
+ case _MI_RS_CONTROL:
+ case _MI_RS_STORE_DATA_IMM:
+ case _MI_SEMAPHORE_SIGNAL:
+ case _MI_SEMAPHORE_WAIT:
+ case _MI_SET_CONTEXT:
+ case _MI_SET_PREDICATE:
+ case _MI_STORE_DATA_IMM:
+ case _MI_STORE_DATA_INDEX:
+ case _MI_SUSPEND_FLUSH:
+ case _MI_UPDATE_GTT:
+ case _MI_USER_INTERRUPT:
+ case _MI_WAIT_FOR_EVENT:
+ case _3DSTATE_PIPE_CONTROL: //3d pipeline flushing
+ case MEDIA_STATE_FLUSH: //compute/media pipeline flushing
+ case _3DSTATE_PIPELINE_SELECT:
+ case _3DSTATE_PIPELINE_SELECT_GM45:
+ return gpu_command_show_value_without_gpu_state;
+
+ case _3DPRIMITIVE:
+ case _GPGPU_WALKER:
+ return gpu_command_show_value_with_gpu_state;
+
+ default:
+ /* TODO: go through state values and correctly tag
+ * what state is part of HW context and what is not.
+ */
+ return gpu_command_save_value_as_state_hw_context;
+ }
+}
+
+enum GPUCommand::gpu_pipeline_type_t
+GPUCommand::
+get_gpu_pipeline_type(struct gen_group *inst)
+{
+ uint32_t op_code;
+ op_code = gen_group_get_opcode(inst);
+ switch (op_code) {
+ case _GPGPU_WALKER:
+ case MEDIA_INTERFACE_DESCRIPTOR_LOAD:
+ case MEDIA_VFE_STATE:
+ case MEDIA_CURBE_LOAD:
+ return gpu_pipeline_compute;
+ default:
+ return gpu_pipeline_gfx;
+  }
+}
+
+uint64_t
+GPUCommand::
+get_gpu_address(const BatchRelocs &relocs,
+ uint64_t dword_offset_from_cmd_start,
+ bool ignore_lower_12_bits) const
+{
+ const uint32_t *p;
+ const GEMBufferObject *gem;
+ uint64_t dword_offset_from_gem_start;
+
+ p = contents_ptr() + dword_offset_from_cmd_start;
+
+  /* recycle the logic in BatchRelocs::get_gpu_address()
+   * for reading a GPU address from memory; if the data is
+   * archived, set the passed GEM BO and offset to values
+   * that will never be in the reloc data.
+ */
+ gem = (m_archived_data.empty()) ? gem_bo() : nullptr;
+ dword_offset_from_gem_start = (gem) ?
+ dword_offset_from_cmd_start + dword_offset() :
+ ~uint64_t(0);
+
+ return relocs.get_gpu_address(gem, dword_offset_from_gem_start,
+ p, ignore_lower_12_bits);
+}
+
+void
+GPUCommand::
+archive_data(const BatchRelocs &relocs)
+{
+ assert(!is_archived());
+ if (m_dword_length > 0) {
+ m_archived_data.resize(m_dword_length);
+ std::copy(m_contents, m_contents + m_dword_length,
+ m_archived_data.begin());
+ relocs.place_relocation_values_into_buffer(m_gem_bo, m_gem_bo_offset,
+ &m_archived_data);
+ m_contents = &m_archived_data[0];
+ }
+}
+
+//////////////////////////////////////////
+// i965LatchState methods
+i965LatchState::
+i965LatchState(void):
+ m_general_state_base_address(0),
+ m_surface_state_base_address(0),
+ m_dynamic_state_base_address(0),
+ m_instruction_base_address(0),
+ m_VIEWPORT_count(-1)
+{}
+
+void
+i965LatchState::
+update_state(BatchbufferDecoder *decoder, BatchbufferLoggerOutput &pfile,
+ const GPUCommand &cmd)
+{
+ GPUCommand::state_key op_code;
+ const GPUCommand *p(&cmd);
+ GPUCommand archived;
+
+ if (!cmd.is_archived()) {
+ archived = cmd;
+ archived.archive_data(decoder->relocs());
+ p = &archived;
+ }
+
+ const GPUCommand &q(*p);
+
+ op_code = gen_group_get_opcode(q.inst());
+ switch(op_code) {
+ case _3DSTATE_VS:
+ update_stage_values(decoder, pfile, q, &m_VS);
+ break;
+ case _3DSTATE_HS:
+ update_stage_values(decoder, pfile, q, &m_HS);
+ break;
+ case _3DSTATE_DS:
+ update_stage_values(decoder, pfile, q, &m_DS);
+ break;
+ case _3DSTATE_GS:
+ update_stage_values(decoder, pfile, q, &m_GS);
+ break;
+ case _3DSTATE_PS:
+ update_stage_values(decoder, pfile, q, &m_PS);
+ break;
+ case STATE_BASE_ADDRESS:
+ update_state_base_address(decoder, pfile, q);
+ break;
+ case _3D_STATE_CLIP: {
+ /* TODO: for GEN5 and before, the maximum number of
+     * viewports is in _3D_STATE_GS
+ */
+ int v;
+ if (q.extract_field_value<int>("Maximum VP Index", &v)) {
+ m_VIEWPORT_count = v + 1;
+ }
+ break;
+ }
+ }
+}
+
+void
+i965LatchState::
+update_stage_values(BatchbufferDecoder *decoder, BatchbufferLoggerOutput &pfile,
+ const GPUCommand &q, per_stage_values *dst)
+{
+ int tmp;
+ if (q.extract_field_value<int>("Sampler Count", &tmp)) {
+    /* 3D_STATE_XS holds the number of samplers divided by 4;
+     * the awful consequence is that we then only know the
+     * number of sampler states up to a multiple of 4.
+ */
+ dst->m_sampler_count = 4 * tmp;
+ }
+
+ if (q.extract_field_value<int>("Binding Table Entry Count", &tmp)) {
+ dst->m_binding_table_count = tmp;
+ }
+}
+
+void
+i965LatchState::
+update_state_base_address_helper(const GPUCommand &q,
+ const char *value_enabled_name,
+ uint64_t *dst, const char *value_name)
+{
+ bool enabled(false);
+ uint64_t v;
+
+ q.extract_field_value<bool>(value_enabled_name, &enabled);
+ if (enabled && q.extract_field_value<uint64_t>(value_name, &v)) {
+ *dst = v & ~uint64_t(0xFFFu);
+ }
+}
+
+void
+i965LatchState::
+update_state_base_address(BatchbufferDecoder *decoder, BatchbufferLoggerOutput &pfile,
+ const GPUCommand &q)
+{
+ assert(q.is_archived());
+ update_state_base_address_helper(q,
+ "General State Base Address Modify Enable",
+ &m_general_state_base_address,
+ "General State Base Address");
+
+ update_state_base_address_helper(q,
+ "Surface State Base Address Modify Enable",
+ &m_surface_state_base_address,
+ "Surface State Base Address");
+
+ update_state_base_address_helper(q,
+ "Dynamic State Base Address Modify Enable",
+ &m_dynamic_state_base_address,
+ "Dynamic State Base Address");
+
+ update_state_base_address_helper(q,
+ "Instruction Base Address Modify Enable",
+ &m_instruction_base_address,
+ "Instruction Base Address");
+}
+
+///////////////////////////////////////////
+// i965Registers methods
+void
+i965Registers::
+update_state(BatchbufferDecoder *decoder, BatchbufferLoggerOutput &pfile,
+ const GPUCommand &q)
+{
+ GPUCommand::state_key op_code;
+
+ op_code = gen_group_get_opcode(q.inst());
+ switch (op_code) {
+ case _MI_LOAD_REGISTER_MEM: {
+      /* A LOAD_REGISTER_MEM command loads the register from a
+       * GEM BO, so we need to get the register value from the
+       * GEM BO that holds it. DANGER: we are reading the value
+       * from the GEM BO after the ioctl returns; if the GEM BO
+       * was written to later in the batchbuffer, then our read
+       * gives the value after everything was done, not the
+       * value at the time the command was used.
+       *
+       * Should we instead record the location and offset of
+       * the value?
+       */
+ uint32_t register_offset, register_value;
+ uint64_t gpu_address;
+
+ register_offset = q[1];
+ gpu_address = q.get_gpu_address(decoder->relocs(), 2, false);
+      if (decoder->tracker().pread_buffer(&register_value, gpu_address,
+                                          sizeof(uint32_t)) != -1) {
+         m_register_values[register_offset] = register_value;
+      }
+ break;
+ }
+
+ case _MI_LOAD_REGISTER_IMM: {
+      /* An IMM load has the values for the registers stored
+       * directly in the batchbuffer; a single command can set
+       * multiple registers.
+       */
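+      /* For illustration: an MI_LOAD_REGISTER_IMM that writes two
+       * registers occupies five DWORDs:
+       *   q[0] = header (opcode | DWORD length)
+       *   q[1] = register offset #0,  q[2] = value #0
+       *   q[3] = register offset #1,  q[4] = value #1
+       */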
+      for (unsigned int i = 1, endi = q.contents_size(); i + 1 < endi; i += 2) {
+ uint32_t register_offset, register_value;
+
+ register_offset = q[i];
+ register_value = q[i + 1];
+ m_register_values[register_offset] = register_value;
+ }
+ break;
+ }
+
+ case _MI_LOAD_REGISTER_REG: {
+      /* this command copies the value of one register to another */
+ uint32_t register_src_offset, register_dst_offset;
+ register_src_offset = q[1];
+ register_dst_offset = q[2];
+ m_register_values[register_dst_offset] = m_register_values[register_src_offset];
+ break;
+ }
+ }
+}
+
+void
+i965Registers::
+decode_contents(BatchbufferDecoder *decoder,
+ enum GPUCommand::gpu_pipeline_type_t pipeline,
+ BatchbufferLoggerOutput &pfile)
+{
+   /* TODO: classify registers by which part(s) of the
+    * pipeline they influence
+    */
+ (void)pipeline;
+
+ pfile.begin_block("Register Values");
+   for(const auto &v : m_register_values) {
+ struct gen_group *reg;
+ reg = gen_spec_find_register(decoder->spec(), v.first);
+
+ if (reg) {
+ pfile.begin_block_value("Register", "%s", reg->name);
+ pfile.print_value("ID", "(0x%x)", v.first);
+ pfile.print_value("value", "0x%x", v.second);
+ } else {
+ pfile.begin_block_value("Unknown register", "(0x%x)", v.first);
+ pfile.print_value("ID", "(0x%x)", v.first);
+ pfile.print_value("value", "0x%x", v.second);
+ }
+ pfile.end_block();
+ }
+ pfile.end_block();
+}
+
+///////////////////////////////////////////////
+// i965HWContextData methods
+i965HWContextData::
+i965HWContextData(uint32_t ctx_id):
+ m_ctx_id(ctx_id)
+{
+}
+
+i965HWContextData::
+~i965HWContextData()
+{
+}
+
+void
+i965HWContextData::
+update_state(BatchbufferDecoder *decoder, BatchbufferLoggerOutput &pfile,
+ const GPUCommand &q)
+{
+ enum GPUCommand::gpu_command_type_t tp;
+ const GPUCommand *pq(&q);
+
+ tp = q.gpu_command_type();
+
+ switch (tp) {
+ case GPUCommand::gpu_command_save_value_as_state_hw_context: {
+ uint32_t op_code;
+ op_code = gen_group_get_opcode(q.inst());
+
+ GPUCommand &dst(m_state[op_code]);
+ dst = q;
+ dst.archive_data(decoder->relocs());
+ pq = &dst;
+ break;
+ }
+
+ case GPUCommand::gpu_command_set_register: {
+      /* TODO: not all registers are part of context state; some
+       * are global to the entire GPU. Eventually we need to
+       * address that issue.
+       */
+ m_registers.update_state(decoder, pfile, q);
+ break;
+ }
+
+ default:
+ /* TODO: should we track the values set by _3DSTATE_VF_INSTANCING? */
+ break;
+ }
+ m_latch_state.update_state(decoder, pfile, *pq);
+}
+
+void
+i965HWContextData::
+decode_contents(BatchbufferDecoder *decoder,
+ enum GPUCommand::gpu_pipeline_type_t pipeline,
+ BatchbufferLoggerOutput &pfile)
+{
+ pfile.begin_block("State of Context");
+   for(const auto &entry : m_state) {
+ if (entry.second.gpu_pipeline_type() == pipeline) {
+ decoder->decode_gpu_command(pfile, entry.second);
+ }
+ }
+ m_registers.decode_contents(decoder, pipeline, pfile);
+ pfile.end_block();
+}
+
+//////////////////////////////////////
+// GPUState methods
+void
+GPUState::
+update_state(BatchbufferDecoder *decoder, BatchbufferLoggerOutput &pfile,
+ const GPUCommand &q)
+{
+ if (q.gpu_command_type() ==
+ GPUCommand::gpu_command_save_value_as_state_not_hw_context) {
+ GPUCommand::state_key op_code;
+ op_code = gen_group_get_opcode(q.inst());
+
+ GPUCommand &dst(m_state[op_code]);
+ dst = q;
+ dst.archive_data(decoder->relocs());
+ } else {
+ m_ctx_data->update_state(decoder, pfile, q);
+ }
+}
+
+void
+GPUState::
+decode_contents(BatchbufferDecoder *decoder,
+ enum GPUCommand::gpu_pipeline_type_t pipeline,
+ BatchbufferLoggerOutput &pfile)
+{
+ m_ctx_data->decode_contents(decoder, pipeline, pfile);
+ if (!m_state.empty()) {
+ pfile.begin_block("State of GPU, not of Context");
+      for(const auto &entry : m_state) {
+ if (entry.second.gpu_pipeline_type() == pipeline) {
+ decoder->decode_gpu_command(pfile, entry.second);
+ }
+ }
+ pfile.end_block();
+ }
+}
+
+///////////////////////////////////////////////
+// BatchbufferDecoder::DetailedDecoder methods
+BatchbufferDecoder::DetailedDecoder::
+DetailedDecoder(void)
+{
+ m_elements[MEDIA_INTERFACE_DESCRIPTOR_LOAD] =
+ &BatchbufferDecoder::decode_media_interface_descriptor_load;
+ m_elements[_3DSTATE_VS] = &BatchbufferDecoder::decode_3dstate_xs;
+ m_elements[_3DSTATE_GS] = &BatchbufferDecoder::decode_3dstate_xs;
+ m_elements[_3DSTATE_DS] = &BatchbufferDecoder::decode_3dstate_xs;
+ m_elements[_3DSTATE_HS] = &BatchbufferDecoder::decode_3dstate_xs;
+ m_elements[_3DSTATE_PS] = &BatchbufferDecoder::decode_3dstate_ps;
+
+ m_elements[_3DSTATE_BINDING_TABLE_POINTERS_VS] =
+ &BatchbufferDecoder::decode_3dstate_binding_table_pointers_vs;
+ m_elements[_3DSTATE_BINDING_TABLE_POINTERS_HS] =
+ &BatchbufferDecoder::decode_3dstate_binding_table_pointers_hs;
+ m_elements[_3DSTATE_BINDING_TABLE_POINTERS_DS] =
+ &BatchbufferDecoder::decode_3dstate_binding_table_pointers_ds;
+ m_elements[_3DSTATE_BINDING_TABLE_POINTERS_GS] =
+ &BatchbufferDecoder::decode_3dstate_binding_table_pointers_gs;
+ m_elements[_3DSTATE_BINDING_TABLE_POINTERS_PS] =
+ &BatchbufferDecoder::decode_3dstate_binding_table_pointers_ps;
+
+ m_elements[_3DSTATE_SAMPLER_STATE_POINTERS_VS] =
+ &BatchbufferDecoder::decode_3dstate_sampler_state_pointers_vs;
+   m_elements[_3DSTATE_SAMPLER_STATE_POINTERS_DS] =
+      &BatchbufferDecoder::decode_3dstate_sampler_state_pointers_ds;
+   m_elements[_3DSTATE_SAMPLER_STATE_POINTERS_HS] =
+      &BatchbufferDecoder::decode_3dstate_sampler_state_pointers_hs;
+ m_elements[_3DSTATE_SAMPLER_STATE_POINTERS_GS] =
+ &BatchbufferDecoder::decode_3dstate_sampler_state_pointers_gs;
+ m_elements[_3DSTATE_SAMPLER_STATE_POINTERS_PS] =
+ &BatchbufferDecoder::decode_3dstate_sampler_state_pointers_ps;
+ m_elements[_3DSTATE_SAMPLER_STATE_POINTERS] =
+ &BatchbufferDecoder::decode_3dstate_sampler_state_pointers_gen6;
+
+ m_elements[_3DSTATE_VIEWPORT_STATE_POINTERS_CC] =
+ &BatchbufferDecoder::decode_3dstate_viewport_state_pointers_cc;
+ m_elements[_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP] =
+ &BatchbufferDecoder::decode_3dstate_viewport_state_pointers_sf_clip;
+ m_elements[_3DSTATE_BLEND_STATE_POINTERS] =
+ &BatchbufferDecoder::decode_3dstate_blend_state_pointers;
+ m_elements[_3DSTATE_CC_STATE_POINTERS] =
+ &BatchbufferDecoder::decode_3dstate_cc_state_pointers;
+ m_elements[_3DSTATE_SCISSOR_STATE_POINTERS] =
+ &BatchbufferDecoder::decode_3dstate_scissor_state_pointers;
+}
+
+void
+BatchbufferDecoder::DetailedDecoder::
+decode(BatchbufferDecoder *decoder, BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ static DetailedDecoder R;
+ std::map<uint32_t, fcn>::const_iterator iter;
+ uint32_t opcode;
+
+ opcode = gen_group_get_opcode(data.inst());
+ iter = R.m_elements.find(opcode);
+ if (iter != R.m_elements.end()) {
+ fcn function(iter->second);
+ (decoder->*function)(pfile, data);
+ }
+}
+
+//////////////////////////////////////////////
+// BatchRelocs methods
+void
+BatchRelocs::
+emit_reloc_data(BatchbufferLoggerOutput &pfile)
+{
+ pfile.begin_block("Relocs");
+ for(const auto &v : m_relocs) {
+
+ if(v.second.empty()) {
+ continue;
+ }
+
+ pfile.begin_block("Relocs on GEM");
+ pfile.print_value("GEM BO", "%u", v.first->handle());
+ for(const auto &w : v.second) {
+ pfile.begin_block("Reloc Entry");
+ pfile.print_value("Offset", "%0x012" PRIx64, w.first);
+ pfile.print_value("GPU Address", "%0x012" PRIx64, w.second);
+ pfile.end_block();
+ }
+ pfile.end_block();
+ }
+ pfile.end_block();
+}
+
+void
+BatchRelocs::
+place_relocation_values_into_buffer(const GEMBufferObject *gem, uint64_t gem_bo_offset,
+ std::vector<uint32_t> *dst) const
+{
+ reloc_map::const_iterator gem_iter;
+ reloc_map_of_gem_bo::const_iterator reloc_iter;
+   uint64_t dst_end;
+
+ gem_iter = m_relocs.find(gem);
+
+ if (gem_iter == m_relocs.end()) {
+ return;
+ }
+
+ dst_end = sizeof(uint32_t) * dst->size() + gem_bo_offset;
+
+ for(reloc_iter = gem_iter->second.lower_bound(gem_bo_offset);
+ reloc_iter != gem_iter->second.end() && reloc_iter->first < dst_end;
+ ++reloc_iter)
+ {
+ unsigned int s;
+ uint64_t addr;
+
+ addr = reloc_iter->second;
+
+ assert(reloc_iter->first >= gem_bo_offset);
+ s = reloc_iter->first - gem_bo_offset;
+
+      /* Recall that the reloc locations in BatchRelocs are copied
+       * directly from the kernel and are in units of bytes,
+       * not DWORDs.
+       */
+ assert(s % sizeof(uint32_t) == 0);
+ s /= sizeof(uint32_t);
+ assert(s < dst->size());
+
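+      /* Illustration: a 48-bit GPU address 0x0000123456789000 is
+       * written back as
+       *   (*dst)[s]     = 0x56789000                        (low 32 bits)
+       *   (*dst)[s + 1] = (old & 0xFFFF0000u) | 0x00001234  (high 16 bits)
+       */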
+ (*dst)[s] = addr & 0xFFFFFFFF;
+ if (!m_32bit_gpu_addresses) {
+ assert(s + 1 < dst->size());
+         /* preserve the high 16 bits of the next DWORD, since
+          * the address is only 48 bits wide and additional data
+          * may be stashed in those highest 16 bits.
+          */
+ (*dst)[s + 1] &= 0xFFFF0000u;
+ (*dst)[s + 1] |= (addr >> 32u) & 0x0000FFFFu;
+ }
+ }
+}
+
+uint64_t
+BatchRelocs::
+get_gpu_address(const GEMBufferObject *q, uint64_t dword_offset,
+ const uint32_t *p, bool ignore_lower_12_bits) const
+{
+ reloc_map::const_iterator gem_iter;
+
+ uint64_t addr = p[0];
+ if (!m_32bit_gpu_addresses) {
+      /* On BDW and above, the address is 48 bits wide: the low
+       * 32 bits are in p[0]; grab the next 16 bits of the
+       * address from p[1].
+       */
+ addr |= uint64_t(p[1] & 0xFFFF) << uint64_t(32);
+ }
+
+ gem_iter = m_relocs.find(q);
+ if (gem_iter != m_relocs.end()) {
+ reloc_map_of_gem_bo::const_iterator reloc_iter;
+ reloc_iter = gem_iter->second.find(sizeof(uint32_t) * dword_offset);
+ if (reloc_iter != gem_iter->second.end()) {
+ addr = reloc_iter->second;
+ }
+ }
+
+   /* Addresses are page aligned (i.e. the last 12 bits are zero),
+    * but HW commands might stash extra data in those 12 bits;
+    * zero those bits out when asked.
+    */
+ return ignore_lower_12_bits ?
+ addr & ~uint64_t(0xFFFu) :
+ addr;
+}
+
+///////////////////////////////////////////////
+// BatchbufferDecoder methods
+BatchbufferDecoder::
+BatchbufferDecoder(enum decode_level_t decode_level,
+ enum print_reloc_level_t print_reloc_level,
+ uint32_t shader_decode_flags,
+ struct gen_spec *spec, struct gen_disasm *dis,
+ int pciid, GEMBufferTracker *tracker,
+ ShaderFileList *shader_filelist,
+ struct drm_i915_gem_execbuffer2 *execbuffer2):
+ m_decode_level(decode_level),
+ m_print_reloc_level(print_reloc_level),
+ m_shader_decode_flags(shader_decode_flags),
+ m_spec(spec),
+ m_gen_disasm(dis),
+ m_pci_id(pciid),
+ m_tracker(tracker),
+ m_shader_filelist(shader_filelist),
+ m_buffers(execbuffer2->buffer_count),
+ m_reloc_handles_are_indices(execbuffer2->flags & I915_EXEC_HANDLE_LUT),
+ m_gpu_state(m_tracker->fetch_hw_context(execbuffer2->rsvd1)),
+ m_relocs(spec),
+ m_execbuffer2(execbuffer2)
+{
+ struct drm_i915_gem_exec_object2 *exec_objects;
+
+ exec_objects = (struct drm_i915_gem_exec_object2 *) (uintptr_t) execbuffer2->buffers_ptr;
+ for(unsigned int i = 0; i < execbuffer2->buffer_count; ++i) {
+ m_buffers[i] = m_tracker->fetch_gem_bo(exec_objects[i].handle);
+ }
+
+ if (execbuffer2->flags & I915_EXEC_BATCH_FIRST) {
+ m_batchbuffer = m_buffers.front();
+ } else {
+ m_batchbuffer = m_buffers.back();
+ }
+
+ m_batchbuffer_log = m_tracker->fetch_or_create(nullptr, m_batchbuffer->handle());
+ assert(m_batchbuffer_log);
+ for(unsigned int i = 0; i < execbuffer2->buffer_count; ++i) {
+ std::pair<bool, GEMBufferObject*> q;
+ std::ostringstream pstr;
+
+ q = tracker->update_gem_bo_gpu_address(&exec_objects[i]);
+ if (!q.second) {
+ continue;
+ }
+
+      /* Bah humbug; the kernel interface does not guarantee that
+       * the address values in a batchbuffer get updated; the
+       * upshot is that we need to examine the reloc data of the
+       * ioctl call.
+       */
+ struct drm_i915_gem_relocation_entry *reloc_entries;
+
+      reloc_entries = (struct drm_i915_gem_relocation_entry*)
+         (uintptr_t) exec_objects[i].relocs_ptr;
+ for (unsigned int r = 0; r < exec_objects[i].relocation_count; ++r) {
+ uint32_t gem_bo_handle;
+ GEMBufferObject *bo;
+ uint64_t gpu_address;
+
+ gem_bo_handle = reloc_entries[r].target_handle;
+ if (execbuffer2->flags & I915_EXEC_HANDLE_LUT) {
+ gem_bo_handle = exec_objects[gem_bo_handle].handle;
+ }
+
+ bo = m_tracker->fetch_gem_bo(gem_bo_handle);
+ if (!bo) {
+ continue;
+ }
+
+ gpu_address = bo->gpu_address_begin() + reloc_entries[r].delta;
+      /* When reading an address at this offset from the BO, we
+       * will read the gpu_address computed above.
+       */
+ m_relocs.add_entry(q.second, reloc_entries[r].offset, gpu_address);
+ }
+ }
+}
+
+void
+BatchbufferDecoder::
+decode_shader(BatchbufferLoggerOutput &pfile, enum shader_decode_entry_t tp,
+ uint64_t gpu_address)
+{
+ const void *shader;
+ GPUAddressQuery query;
+ static const char *labels[shader_decode_entry_count] = {
+ [shader_decode_vs] = "Vertex Shader",
+ [shader_decode_hs] = "Hull (tessellation control) Shader",
+      [shader_decode_ds] = "Domain (tessellation evaluation) Shader",
+ [shader_decode_gs] = "Geometry Shader",
+ [shader_decode_ps_8] = "8-Pixel Shader",
+ [shader_decode_ps_16] = "16-Pixel Shader",
+ [shader_decode_ps_32] = "32-Pixel Shader",
+ [shader_decode_media_compute] = "Media/Compute Shader",
+ };
+
+ pfile.begin_block(labels[tp]);
+
+ shader = m_tracker->cpu_mapped<void>(gpu_address, &query);
+ pfile.print_value("GPU Address", "0x%012" PRIx64, gpu_address);
+ if (shader && query.m_gem_bo) {
+ if (m_shader_decode_flags & (1u << tp)) {
+ pfile.functor_print_value("Assembly",
+ std::bind(gen_disasm_disassemble,
+ m_gen_disasm,
+ shader, 0,
+ std::placeholders::_1),
+ true);
+ } else {
+ const char *filename;
+ filename = m_shader_filelist->filename(pfile.filename(), shader,
+ m_pci_id, m_gen_disasm);
+ if (filename) {
+ pfile.print_value("ShaderFile", "%s", filename);
+ }
+ }
+ } else {
+ pfile.print_value("GPU Address", "0x%012 (BAD)" PRIx64, gpu_address);
+ }
+
+ pfile.end_block();
+}
+
+void
+BatchbufferDecoder::
+emit_log(BatchbufferLoggerOutput &pfile)
+{
+ assert(m_batchbuffer_log);
+
+ bool print_ioctl;
+
+ print_ioctl = m_batchbuffer_log->emit_log(this, pfile, m_execbuffer2->batch_len / 4);
+
+ /* Only print the drmIoctl command details if
+ * the last APIStartCallMarker was printable
+ */
+ if (print_ioctl && pfile) {
+ pfile.begin_block("drmIoctl(execbuffer2)");
+ pfile.print_value("length", "%d bytes", m_execbuffer2->batch_len);
+ pfile.print_value("length", "%d dwords", m_execbuffer2->batch_len / 4);
+ pfile.print_value("start", "%d", m_execbuffer2->batch_start_offset);
+ pfile.print_value("fd", "%d", m_batchbuffer_log->src()->fd);
+ pfile.print_value("GEM BO", "%u", m_batchbuffer_log->src()->gem_bo);
+
+ if (m_print_reloc_level >= print_reloc_gem_gpu_updates) {
+ m_relocs.emit_reloc_data(pfile);
+ }
+ pfile.end_block();
+ }
+}
+
+
+void
+BatchbufferDecoder::
+decode_media_interface_descriptor_load(BatchbufferLoggerOutput &pfile, const GPUCommand &data)
+{
+ struct gen_group *grp;
+ uint64_t gpu_address;
+
+ grp = gen_spec_find_struct(m_spec, "INTERFACE_DESCRIPTOR_DATA");
+ if (!grp) {
+ return;
+ }
+
+ gpu_address = m_gpu_state.ctx().m_latch_state.m_dynamic_state_base_address + data[3];
+ for(int i = 0, length = data[2] / 32; i < length; ++i,
+ gpu_address += 8 * sizeof(uint32_t)) {
+ GPUAddressQuery address_query(m_tracker->get_memory_at_gpu_address(gpu_address));
+ GPUCommand descriptor(address_query, m_spec, grp);
+ uint64_t shader_gpu_address;
+ int tmp, binding_table_count, sampler_count;
+
+ pfile.begin_block_value("Descriptor", "#%d", i);
+ pfile.print_value("GPU Address", "%012" PRIx64, gpu_address);
+ decode_gen_group(pfile, descriptor.gem_bo(), descriptor.dword_offset(),
+ descriptor.contents_ptr(), descriptor.inst());
+
+ shader_gpu_address = m_gpu_state.ctx().m_latch_state.m_instruction_base_address + descriptor[0];
+
+ /* ISSUE: When decoding from UFO, we get crashes on Media/Compute
+ * shader decode from within gen_disasm_disassemble().
+ */
+ decode_shader(pfile, shader_decode_media_compute, shader_gpu_address);
+
+ sampler_count = -1;
+ if (descriptor.extract_field_value<int>("Sampler Count", &tmp)) {
+ sampler_count = 4 * tmp;
+ }
+
+ binding_table_count = -1;
+ if (descriptor.extract_field_value<int>("Binding Table Entry Count", &tmp)) {
+ binding_table_count = tmp;
+ }
+
+ decode_3dstate_sampler_state_pointers_helper(pfile,
+ descriptor[3] & ~0x1f,
+ sampler_count);
+ decode_3dstate_binding_table_pointers(pfile, "MEDIA",
+ descriptor[4] & ~0x1f,
+ binding_table_count);
+
+ pfile.end_block();
+ }
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_xs(BatchbufferLoggerOutput &pfile, const GPUCommand &data)
+{
+ bool has_shader(false);
+ uint64_t offset(0), gpu_address;
+ uint32_t opcode;
+ enum shader_decode_entry_t shader_tp;
+
+ data.extract_field_value<bool>("Enable", &has_shader);
+ has_shader = has_shader
+ && data.extract_field_value<uint64_t>("Kernel Start Pointer", &offset);
+
+ if(!has_shader) {
+ return;
+ }
+
+ opcode = gen_group_get_opcode(data.inst());
+ switch(opcode) {
+ default:
+ case _3DSTATE_VS:
+ shader_tp = shader_decode_vs;
+ break;
+ case _3DSTATE_HS:
+ shader_tp = shader_decode_hs;
+ break;
+ case _3DSTATE_DS:
+ shader_tp = shader_decode_ds;
+ break;
+ case _3DSTATE_GS:
+ shader_tp = shader_decode_gs;
+ break;
+ }
+
+ gpu_address = m_gpu_state.ctx().m_latch_state.m_instruction_base_address + offset;
+ decode_shader(pfile, shader_tp, gpu_address);
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_ps(BatchbufferLoggerOutput &pfile, const GPUCommand &data)
+{
+ typedef std::pair<enum shader_decode_entry_t, const char*> decode_job;
+ std::vector<decode_job> decode_jobs;
+ bool has_8(false), has_16(false), has_32(false);
+   int num_enabled;
+ static const char *kernels[3] = {
+ "Kernel Start Pointer 0",
+ "Kernel Start Pointer 1",
+ "Kernel Start Pointer 2",
+ };
+
+ data.extract_field_value<bool>("8 Pixel Dispatch Enable", &has_8);
+ data.extract_field_value<bool>("16 Pixel Dispatch Enable", &has_16);
+ data.extract_field_value<bool>("32 Pixel Dispatch Enable", &has_32);
+
+   /* GEN is amusing at times: depending on which dispatches are
+    * enabled, the kernel used for each dispatch mode changes.
+    *
+    * | 8-enabled | 16-enabled | 32-enabled | 8-shader | 16-shader | 32-shader |
+    * |   TRUE    |   FALSE    |   FALSE    | Kernel0  |           |           |
+    * |   TRUE    |   TRUE     |   FALSE    | Kernel0  | Kernel2   |           |
+    * |   TRUE    |   TRUE     |   TRUE     | Kernel0  | Kernel2   | Kernel1   |
+    * |   FALSE   |   TRUE     |   FALSE    |          | Kernel0   |           |
+    * |   FALSE   |   FALSE    |   TRUE     |          |           | Kernel0   |
+    * |   FALSE   |   TRUE     |   TRUE     |          | Kernel2   | Kernel1   |
+    *
+    * From the table we can extract a simple set of rules:
+    *  - 8-wide, if enabled, is always at Kernel0
+    *  - if N-wide is the only mode enabled, it is at Kernel0
+    *  - if at least two modes are enabled, then 16-wide is at
+    *    Kernel2 and 32-wide is at Kernel1.
+    */
+   num_enabled = int(has_8) + int(has_16) + int(has_32);
+ if (has_8) {
+ decode_jobs.push_back(decode_job(shader_decode_ps_8, kernels[0]));
+ }
+
+   if (num_enabled > 1) {
+ if (has_16) {
+ decode_jobs.push_back(decode_job(shader_decode_ps_16, kernels[2]));
+ }
+ if (has_32) {
+ decode_jobs.push_back(decode_job(shader_decode_ps_32, kernels[1]));
+ }
+ } else {
+ if (has_16) {
+ decode_jobs.push_back(decode_job(shader_decode_ps_16, kernels[0]));
+      } else if (has_32) {
+ decode_jobs.push_back(decode_job(shader_decode_ps_32, kernels[0]));
+ }
+ }
+
+ for (const decode_job &J : decode_jobs) {
+ uint64_t addr;
+ if (data.extract_field_value<uint64_t>(J.second, &addr)) {
+ addr += m_gpu_state.ctx().m_latch_state.m_instruction_base_address;
+ decode_shader(pfile, J.first, addr);
+ }
+ }
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_constant(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+   /* TODO: decode the push-constant buffer contents */
+   (void)pfile;
+   (void)data;
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_binding_table_pointers_vs(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ decode_3dstate_binding_table_pointers(pfile, "VS", data[1] & ~0x1fu,
+ m_gpu_state.ctx().m_latch_state.m_VS.m_binding_table_count);
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_binding_table_pointers_ds(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ decode_3dstate_binding_table_pointers(pfile, "DS", data[1] & ~0x1fu,
+ m_gpu_state.ctx().m_latch_state.m_DS.m_binding_table_count);
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_binding_table_pointers_hs(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ decode_3dstate_binding_table_pointers(pfile, "HS", data[1] & ~0x1fu,
+ m_gpu_state.ctx().m_latch_state.m_HS.m_binding_table_count);
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_binding_table_pointers_ps(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ decode_3dstate_binding_table_pointers(pfile, "PS", data[1] & ~0x1fu,
+ m_gpu_state.ctx().m_latch_state.m_PS.m_binding_table_count);
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_binding_table_pointers_gs(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ decode_3dstate_binding_table_pointers(pfile, "GS", data[1] & ~0x1fu,
+ m_gpu_state.ctx().m_latch_state.m_GS.m_binding_table_count);
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_binding_table_pointers(BatchbufferLoggerOutput &pfile,
+ const std::string &label, uint32_t offset,
+ int cnt)
+{
+ struct gen_group *surface_state;
+ uint64_t gpu_address;
+ GPUAddressQuery Q;
+ const uint32_t *v;
+
+   /* The command essentially just provides an address (given as an
+    * offset from surface_state_base_address) of a sequence of values V.
+    * Each element of V is in turn an offset from
+    * surface_state_base_address giving the location of a surface
+    * state value.
+    */
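+   /* Illustration with hypothetical values: if
+    * surface_state_base_address = 0x1000000 and offset = 0x40, then
+    * V lives at GPU address 0x1000040; if in addition V[0] = 0x80,
+    * the first RENDER_SURFACE_STATE lives at GPU address 0x1000080.
+    */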
+ surface_state = gen_spec_find_struct(m_spec, "RENDER_SURFACE_STATE");
+ gpu_address = offset + m_gpu_state.ctx().m_latch_state.m_surface_state_base_address;
+ v = m_tracker->cpu_mapped<uint32_t>(gpu_address, &Q);
+
+ if (!Q.m_gem_bo || !surface_state) {
+ return;
+ }
+
+ pfile.begin_block_value("Binding Tables", "%s", label.c_str());
+
+   /* The i965 driver does track (roughly) the number of binding
+    * table entries of each program stage: the value of
+    * X.base.binding_table.size_bytes / 4 is the number of entries
+    * for a stage X, where X is one of brw->wm, brw->vs, brw->gs,
+    * brw->tcs or brw->tes.
+    */
+ if (cnt < 0) {
+ cnt = 16;
+ pfile.print_value("Count", "%d (Guessing)", cnt);
+ } else {
+ pfile.print_value("Count", "%d", cnt);
+ }
+
+ for (int i = 0; i < cnt; ++i) {
+ uint64_t state_gpu_address;
+ const uint32_t *state_ptr;
+ GPUAddressQuery SQ;
+
+ if (v[i] == 0) {
+ continue;
+ }
+
+ pfile.begin_block_value("Binding Table", "#%d", i);
+ pfile.print_value("offset", "%u", v[i]);
+
+ state_gpu_address = v[i] + m_gpu_state.ctx().m_latch_state.m_surface_state_base_address;
+ state_ptr = m_tracker->cpu_mapped<uint32_t>(state_gpu_address, &SQ);
+ if (!SQ.m_gem_bo) {
+ pfile.print_value("GPU address", "0x%012 (BAD)" PRIx64, state_gpu_address);
+ pfile.end_block();
+ continue;
+ }
+
+ pfile.print_value("GPU address", "0x%012 " PRIx64, state_gpu_address);
+ decode_gen_group(pfile, SQ.m_gem_bo, SQ.m_offset_into_gem_bo, state_ptr, surface_state);
+
+ pfile.end_block();
+ }
+
+ pfile.end_block();
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_sampler_state_pointers_vs(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ int cnt;
+ cnt = m_gpu_state.ctx().m_latch_state.m_VS.m_sampler_count;
+ decode_3dstate_sampler_state_pointers_helper(pfile, data[1], cnt);
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_sampler_state_pointers_gs(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ int cnt;
+ cnt = m_gpu_state.ctx().m_latch_state.m_GS.m_sampler_count;
+ decode_3dstate_sampler_state_pointers_helper(pfile, data[1], cnt);
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_sampler_state_pointers_hs(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ int cnt;
+ cnt = m_gpu_state.ctx().m_latch_state.m_HS.m_sampler_count;
+ decode_3dstate_sampler_state_pointers_helper(pfile, data[1], cnt);
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_sampler_state_pointers_ds(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ int cnt;
+ cnt = m_gpu_state.ctx().m_latch_state.m_DS.m_sampler_count;
+ decode_3dstate_sampler_state_pointers_helper(pfile, data[1], cnt);
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_sampler_state_pointers_ps(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ int cnt;
+ cnt = m_gpu_state.ctx().m_latch_state.m_PS.m_sampler_count;
+ decode_3dstate_sampler_state_pointers_helper(pfile, data[1], cnt);
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_sampler_state_pointers_gen6(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+   int sampler_counts[3] = {
+ m_gpu_state.ctx().m_latch_state.m_VS.m_sampler_count,
+ m_gpu_state.ctx().m_latch_state.m_GS.m_sampler_count,
+ m_gpu_state.ctx().m_latch_state.m_PS.m_sampler_count
+ };
+
+ for (unsigned int stage = 0; stage < 3; ++stage) {
+ int cnt;
+      cnt = sampler_counts[stage];
+ decode_3dstate_sampler_state_pointers_helper(pfile, data[stage + 1], cnt);
+ }
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_sampler_state_pointers_helper(BatchbufferLoggerOutput &pfile,
+ uint32_t offset, int cnt)
+{
+ struct gen_group *g;
+ uint64_t gpu_address;
+
+ g = gen_spec_find_struct(m_spec, "SAMPLER_STATE");
+ pfile.begin_block("SAMPLER_STATEs");
+
+ if (cnt < 0) {
+ cnt = 4;
+ pfile.print_value("Count", "%d (Guessing)", cnt);
+ } else {
+ pfile.print_value("Count", "%d", cnt);
+ }
+
+ gpu_address = m_gpu_state.ctx().m_latch_state.m_dynamic_state_base_address + offset;
+ for (int i = 0; i < cnt; ++i) {
+ pfile.begin_block_value("SamplerState", "#%d", i);
+      /* each SAMPLER_STATE is 4 DWORDs (16 bytes) */
+      decode_pointer_helper(pfile, g, gpu_address + i * 16);
+ pfile.end_block();
+ }
+
+ pfile.end_block();
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_viewport_state_pointers_cc(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ uint64_t gpu_address;
+ struct gen_group *g;
+
+ g = gen_spec_find_struct(m_spec, "CC_VIEWPORT");
+ gpu_address = m_gpu_state.ctx().m_latch_state.m_dynamic_state_base_address + (data[1] & ~0x1fu);
+
+ pfile.begin_block("CC_VIEWPORTs");
+
+ uint32_t cnt;
+ if (m_gpu_state.ctx().m_latch_state.m_VIEWPORT_count < 0) {
+ cnt = 4;
+ pfile.print_value("Count", "%d (Guessing)", cnt);
+ } else {
+ cnt = m_gpu_state.ctx().m_latch_state.m_VIEWPORT_count;
+ pfile.print_value("Count", "%d", cnt);
+ }
+
+ for (uint32_t i = 0; i < cnt; ++i) {
+ pfile.begin_block_value("CC-Viewport", "#%d", i);
+ decode_pointer_helper(pfile, g, gpu_address + i * 8);
+ pfile.end_block();
+ }
+
+ pfile.end_block();
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_viewport_state_pointers_sf_clip(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ uint64_t gpu_address;
+ struct gen_group *g;
+
+ g = gen_spec_find_struct(m_spec, "SF_CLIP_VIEWPORT");
+ gpu_address = m_gpu_state.ctx().m_latch_state.m_dynamic_state_base_address + (data[1] & ~0x3fu);
+
+ pfile.begin_block("SF_CLIP_VIEWPORTs");
+
+ uint32_t cnt;
+ if (m_gpu_state.ctx().m_latch_state.m_VIEWPORT_count < 0) {
+ cnt = 4;
+ pfile.print_value("Count", "%d (Guessing)", cnt);
+ } else {
+ cnt = m_gpu_state.ctx().m_latch_state.m_VIEWPORT_count;
+ pfile.print_value("Count", "%d", cnt);
+ }
+
+ for (uint32_t i = 0; i < cnt; ++i) {
+ pfile.begin_block_value("Viewport", "#%d", i);
+ decode_pointer_helper(pfile, g, gpu_address + i * 64);
+ pfile.end_block();
+ }
+
+ pfile.end_block();
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_blend_state_pointers(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ uint64_t gpu_address;
+
+ gpu_address = m_gpu_state.ctx().m_latch_state.m_dynamic_state_base_address + (data[1] & ~0x3fu);
+ pfile.begin_block("BLEND_STATE");
+ decode_pointer_helper(pfile, "BLEND_STATE", gpu_address);
+ pfile.end_block();
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_cc_state_pointers(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ uint64_t gpu_address;
+
+ gpu_address = m_gpu_state.ctx().m_latch_state.m_dynamic_state_base_address + (data[1] & ~0x3fu);
+ pfile.begin_block("COLOR_CALC_STATE");
+ decode_pointer_helper(pfile, "COLOR_CALC_STATE", gpu_address);
+ pfile.end_block();
+}
+
+void
+BatchbufferDecoder::
+decode_3dstate_scissor_state_pointers(BatchbufferLoggerOutput &pfile,
+ const GPUCommand &data)
+{
+ uint64_t gpu_address;
+
+ gpu_address = m_gpu_state.ctx().m_latch_state.m_dynamic_state_base_address + (data[1] & ~0x1fu);
+ pfile.begin_block("SCISSOR_RECT");
+ decode_pointer_helper(pfile, "SCISSOR_RECT", gpu_address);
+ pfile.end_block();
+}
+
+void
+BatchbufferDecoder::
+decode_pointer_helper(BatchbufferLoggerOutput &pfile,
+ const char *instruction_name, uint64_t gpu_address)
+{
+ struct gen_group *g;
+
+ g = gen_spec_find_struct(m_spec, instruction_name);
+ if (g) {
+ pfile.print_value("Type", instruction_name);
+ decode_pointer_helper(pfile, g, gpu_address);
+ } else {
+ pfile.print_value("Unknown Type", "%s", instruction_name);
+ }
+}
+
+void
+BatchbufferDecoder::
+decode_pointer_helper(BatchbufferLoggerOutput &pfile,
+ struct gen_group *g, uint64_t gpu_address)
+{
+ const uint32_t *p;
+ GPUAddressQuery Q;
+
+ p = m_tracker->cpu_mapped<uint32_t>(gpu_address, &Q);
+ if (p) {
+ int len;
+ len = gen_group_get_length(g, p);
+
+ if (len < 0) {
+ pfile.print_value("BAD length", "%d", len);
+ return;
+ }
+
+      if (Q.m_offset_into_gem_bo + sizeof(uint32_t) * len > Q.m_gem_bo->size()) {
+         pfile.begin_block("Length too large");
+ pfile.print_value("length", "%d", len);
+ pfile.print_value("GEM BO offset", "%u", Q.m_offset_into_gem_bo);
+ pfile.print_value("GEM BO size", "%u", Q.m_gem_bo->size());
+ pfile.end_block();
+ return;
+ }
+ } else {
+ pfile.print_value("Bad GPU Address", "0x%012" PRIx64, gpu_address);
+ return;
+ }
+
+ decode_gen_group(pfile, Q.m_gem_bo, Q.m_offset_into_gem_bo, p, g);
+}
+
+void
+BatchbufferDecoder::
+decode_gen_group(BatchbufferLoggerOutput &pfile,
+ const GEMBufferObject *q, uint64_t dword_offset,
+ const uint32_t *p, struct gen_group *group)
+{
+ struct gen_field_iterator iter;
+
+ gen_field_iterator_init(&iter, group, p, false);
+
+ while (gen_field_iterator_next(&iter)) {
+ if (!is_header_field(group, iter.field)) {
+ if (iter.struct_desc) {
+ uint64_t struct_offset;
+ struct_offset = dword_offset + iter.dword;
+ pfile.begin_block_value(iter.name, "%s", iter.value);
+ decode_gen_group(pfile, q, struct_offset,
+ p + iter.dword, iter.struct_desc);
+ pfile.end_block();
+ } else {
+ pfile.print_value(iter.name, "%s", iter.value);
+ }
+ }
+ }
+}
+
+void
+BatchbufferDecoder::
+decode_gpu_command(BatchbufferLoggerOutput &pfile, const GPUCommand &q)
+{
+ pfile.begin_block(gen_group_get_name(q.inst()));
+ decode_gen_group(pfile, q.gem_bo(), q.dword_offset(), q.contents_ptr(), q.inst());
+ DetailedDecoder::decode(this, pfile, q);
+ pfile.end_block();
+}
+
+void
+BatchbufferDecoder::
+decode_gpu_execute_command(BatchbufferLoggerOutput &pfile, const GPUCommand &q)
+{
+ pfile.begin_block("Execute GPU command");
+ pfile.print_value("Command", "%s", gen_group_get_name(q.inst()));
+
+ decode_gpu_command(pfile, q);
+ pfile.begin_block("GPU State");
+ m_gpu_state.decode_contents(this, q.gpu_pipeline_type(), pfile);
+ pfile.end_block();
+
+ pfile.end_block();
+}
+
+void
+BatchbufferDecoder::
+process_gpu_command(bool printing_enabled, BatchbufferLoggerOutput &pfile,
+ const GPUCommand &q)
+{
+ enum GPUCommand::gpu_command_type_t tp;
+
+ m_gpu_state.update_state(this, pfile, q);
+ tp = q.gpu_command_type();
+ switch (tp) {
+ case GPUCommand::gpu_command_show_value_with_gpu_state:
+ if (printing_enabled) {
+ decode_gpu_execute_command(pfile, q);
+ }
+ break;
+
+ case GPUCommand::gpu_command_show_value_without_gpu_state:
+ if (printing_enabled) {
+ decode_gen_group(pfile, q.gem_bo(), q.dword_offset(), q.contents_ptr(), q.inst());
+ DetailedDecoder::decode(this, pfile, q);
+ }
+ break;
+
+ default:
+ /* nothing */
+ break;
+ }
+}
+
+void
+BatchbufferDecoder::
+absorb_batchbuffer_contents(bool printing_enabled, BatchbufferLoggerOutput &dst,
+ unsigned int start_dword, unsigned int end_dword)
+{
+ if (m_decode_level == no_decode || start_dword >= end_dword) {
+ return;
+ }
+
+ int length;
+
+ for (; start_dword < end_dword; start_dword += length) {
+ GPUCommand q(m_batchbuffer, start_dword, m_spec);
+
+ length = std::max(1u, q.contents_size());
+ if (q.inst()) {
+         if (printing_enabled) {
+            dst.begin_block_value(gen_group_get_name(q.inst()), "%u", start_dword);
+         }
+
+ if (m_decode_level >= instruction_details_decode) {
+ process_gpu_command(printing_enabled, dst, q);
+ }
+
+ if (printing_enabled) {
+ dst.end_block();
+ }
+ } else if (printing_enabled) {
+ dst.begin_block_value("Unknown instruction", "%u (0x%08x)",
+ start_dword, q[0]);
+ dst.end_block();
+ }
+ }
+}
+
+//////////////////////////////////
+// ShaderFileList methods
+const char*
+ShaderFileList::
+filename(const std::string &fileprefix, const void *shader,
+ int pciid, struct gen_disasm *gen_disasm)
+{
+ sha1_value key;
+ std::map<sha1_value, std::string>::iterator iter;
+ int shader_sz;
+
+ shader_sz = gen_disasm_assembly_length(gen_disasm, shader, 0);
+ _mesa_sha1_compute(shader, shader_sz, key.data());
+ iter = m_files.find(key);
+ if (iter != m_files.end()) {
+ return iter->second.c_str();
+ }
+
+ std::ostringstream str;
+ std::string filename;
+
+ str << fileprefix << "-shader_file#" << ++m_count
+ << ".pciid." << pciid << ".bin";
+ filename = str.str();
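+   /* e.g., assuming m_count starts at zero and pciid = 22806, the
+    * first file written is named
+    * "<fileprefix>-shader_file#1.pciid.22806.bin"
+    */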
+
+ std::ofstream shader_file(filename.c_str(),
+ std::ios_base::out | std::ios_base::binary);
+ if (!shader_file.is_open()) {
+ return nullptr;
+ }
+
+ shader_file.write(static_cast<const char*>(shader), shader_sz);
+ iter = m_files.insert(std::make_pair(key, filename)).first;
+
+ return iter->second.c_str();
+}
+
+//////////////////////////////////
+// BatchbufferLog methods
+void
+BatchbufferLog::
+add_ioctl_log_entry(const std::string &entry)
+{
+ if (!m_prints.empty()) {
+ m_prints.back().add_ioctl_log_entry(entry);
+ } else {
+ m_orphan_ioctl_log_entries.push_back(entry);
+ }
+}
+
+
+void
+BatchbufferLog::
+handle_batchbuffer_contents(bool printing_enabled,
+ BatchbufferDecoder *decoder,
+ BatchbufferLoggerOutput &dst,
+ uint32_t start, uint32_t end)
+{
+ if (printing_enabled && dst) {
+ dst.begin_block_value("GPU commands", "[%u, %u)", start, end);
+ dst.print_value("dword start", "%u", start);
+ dst.print_value("dword end", "%u", end);
+ dst.print_value("dword length", "%u", end - start);
+ }
+
+ if (decoder) {
+ decoder->absorb_batchbuffer_contents(printing_enabled, dst,
+ start, end);
+ }
+
+ if (printing_enabled && dst) {
+ dst.end_block();
+ }
+}
+
+bool
+BatchbufferLog::
+emit_log(BatchbufferDecoder *decoder, BatchbufferLoggerOutput &dst,
+ uint32_t batchbuffer_len)
+{
+ uint32_t last_time(0);
+ unsigned int top_level(dst.current_block_level());
+ bool printing_enabled(false);
+
+ for(auto iter = m_prints_from_dummy.begin();
+ iter != m_prints_from_dummy.end(); ++iter) {
+ APIStartCallMarker &entry(*iter);
+ printing_enabled = entry.emit(entry.start_bb_location(), dst,
+ top_level);
+ }
+
+ APIStartCallMarker::print_ioctl_log(m_orphan_ioctl_log_entries, dst);
+
+ for(auto iter = m_prints.begin(); iter != m_prints.end(); ++iter) {
+ APIStartCallMarker &entry(*iter);
+ if (entry.start_bb_location() > last_time) {
+ /* We clear to 1 level, so that the batch-buffer decoding
+ * is a child element of the last APIStartCallMarker block;
+ * note that if m_prints_from_dummy was empty and entry is
+ * the first element in m_prints, then the GPU command decode
+ * block is a child of top_level instead.
+ */
+ dst.clear_block_stack(top_level + 1);
+ handle_batchbuffer_contents(printing_enabled, decoder, dst,
+ last_time, entry.start_bb_location());
+ last_time = entry.start_bb_location();
+ }
+
+ auto next_iter(iter);
+ uint32_t next_time;
+
+ ++next_iter;
+ next_time = (next_iter != m_prints.end()) ?
+ next_iter->start_bb_location() :
+ batchbuffer_len;
+ printing_enabled = entry.emit(next_time, dst, top_level);
+ }
+
+ /* close up all blocks we have left open */
+ if (dst) {
+ dst.clear_block_stack(top_level);
+ }
+
+ if (batchbuffer_len > last_time) {
+ handle_batchbuffer_contents(printing_enabled, decoder, dst,
+ last_time, batchbuffer_len);
+ }
+
+ return printing_enabled;
+}
+
+//////////////////////////////
+// GEMBufferTracker methods
+GEMBufferTracker::
+GEMBufferTracker(int fd):
+ m_fd(fd),
+ m_dummy_hw_ctx(0)
+{}
+
+GEMBufferTracker::
+~GEMBufferTracker()
+{
+ for(const auto &value : m_gem_bos_by_handle) {
+ delete value.second;
+ }
+}
+
+void
+GEMBufferTracker::
+emit_unemitted_log(BatchbufferLoggerOutput &dst)
+{
+ bool has_stuff_to_emit(false);
+
+ for (const auto &v : m_logs) {
+ if (!v.second.empty()) {
+ has_stuff_to_emit = true;
+ break;
+ }
+ }
+
+ if (!has_stuff_to_emit)
+ return;
+
+ dst.begin_block("UnemittedBatchbuffer");
+ dst.print_value("fd", "%d", m_fd);
+ for (auto &v : m_logs) {
+ if (!v.second.empty()) {
+ dst.begin_block_value("gem_bo", "%u", v.second.src()->gem_bo);
+ v.second.emit_log(nullptr, dst, 0);
+ dst.end_block();
+ }
+ }
+ dst.end_block();
+}
+
+void
+GEMBufferTracker::
+add_gem_bo(const struct drm_i915_gem_create &pdata)
+{
+ GEMBufferObject *p;
+ p = new GEMBufferObject(m_fd, pdata);
+ m_gem_bos_by_handle[pdata.handle] = p;
+}
+
+void
+GEMBufferTracker::
+add_gem_bo(const struct drm_i915_gem_userptr &pdata)
+{
+ GEMBufferObject *p;
+ p = new GEMBufferObject(m_fd, pdata);
+ m_gem_bos_by_handle[pdata.handle] = p;
+}
+
+void
+GEMBufferTracker::
+remove_gem_bo(uint32_t h)
+{
+ std::map<uint32_t, GEMBufferObject*>::const_iterator iter;
+ GEMBufferObject *p;
+
+ iter = m_gem_bos_by_handle.find(h);
+ if (iter != m_gem_bos_by_handle.end()) {
+ p = iter->second;
+ m_gem_bos_by_handle.erase(iter);
+ m_gem_bos_by_gpu_address_end.erase(p->gpu_address_end());
+ delete p;
+ }
+}
+
+GEMBufferObject*
+GEMBufferTracker::
+fetch_gem_bo(uint32_t h) const
+{
+ std::map<uint32_t, GEMBufferObject*>::const_iterator iter;
+ iter = m_gem_bos_by_handle.find(h);
+ return (iter != m_gem_bos_by_handle.end()) ?
+ iter->second :
+ nullptr;
+}
+
+void
+GEMBufferTracker::
+add_hw_context(const struct drm_i915_gem_context_create &create)
+{
+ uint32_t h;
+ h = create.ctx_id;
+ m_hw_contexts.insert(std::make_pair(h, i965HWContextData(h)));
+}
+
+void
+GEMBufferTracker::
+remove_hw_context(const struct drm_i915_gem_context_destroy &destroy)
+{
+ uint32_t h;
+ h = destroy.ctx_id;
+ m_hw_contexts.erase(h);
+}
+
+i965HWContextData*
+GEMBufferTracker::
+fetch_hw_context(uint32_t h)
+{
+ std::map<uint32_t, i965HWContextData>::iterator iter;
+ iter = m_hw_contexts.find(h);
+ if (iter != m_hw_contexts.end()) {
+ return &iter->second;
+ } else {
+ m_dummy_hw_ctx = i965HWContextData(0);
+ return &m_dummy_hw_ctx;
+ }
+}
+
+std::pair<bool, GEMBufferObject*>
+GEMBufferTracker::
+update_gem_bo_gpu_address(const struct drm_i915_gem_exec_object2 *p)
+{
+ std::map<uint32_t, GEMBufferObject*>::const_iterator iter;
+
+ iter = m_gem_bos_by_handle.find(p->handle);
+ if (iter == m_gem_bos_by_handle.end()) {
+ return std::make_pair(false, nullptr);
+ }
+
+ uint64_t old_gpu_address;
+ old_gpu_address = iter->second->gpu_address_begin();
+ if (old_gpu_address != p->offset) {
+ /* remove from m_gem_bos_by_gpu_address_end
+ * before updating
+ */
+ m_gem_bos_by_gpu_address_end.erase(iter->second->gpu_address_end());
+
+ /* Update GPU address of GEM BO */
+ iter->second->update_gpu_address(p->offset);
+
+ /* Place GEM BO into m_gem_bos_by_gpu_address_end */
+ uint64_t key;
+ key = iter->second->gpu_address_end();
+ m_gem_bos_by_gpu_address_end[key] = iter->second;
+ return std::make_pair(true, iter->second);
+ }
+
+ return std::make_pair(false, iter->second);
+}
+
+GPUAddressQuery
+GEMBufferTracker::
+get_memory_at_gpu_address(uint64_t address) const
+{
+ std::map<uint64_t, GEMBufferObject*>::const_iterator iter;
+ GPUAddressQuery return_value;
+
+ /* Get the first BO whose GPU end address is
+ * greater than address, thus iter->first > address
+ */
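+   /* Illustration: with BOs A = [0x1000, 0x2000) and
+    * B = [0x3000, 0x4000) keyed by end address, a query for
+    * 0x3500 finds upper_bound = 0x4000 (BO B) and
+    * 0x3500 >= 0x3000, so it hits B; a query for 0x2500 also
+    * finds B, but 0x2500 < 0x3000, so it (correctly) misses.
+    */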
+ iter = m_gem_bos_by_gpu_address_end.upper_bound(address);
+ if (iter != m_gem_bos_by_gpu_address_end.end()
+ && iter->second->gpu_address_begin() <= address) {
+ return_value.m_gem_bo = iter->second;
+ return_value.m_offset_into_gem_bo =
+ address - iter->second->gpu_address_begin();
+ } else {
+ return_value.m_gem_bo = nullptr;
+ return_value.m_offset_into_gem_bo = 0uL;
+ }
+ return return_value;
+}
+
+template<typename T>
+const T*
+GEMBufferTracker::
+cpu_mapped(uint64_t gpu_address, GPUAddressQuery *q)
+{
+ GPUAddressQuery Q;
+
+ q = (q) ? q : &Q;
+ *q = get_memory_at_gpu_address(gpu_address);
+ if (q->m_gem_bo) {
+ const void *p;
+ p = q->m_gem_bo->cpu_mapped<uint8_t>() + q->m_offset_into_gem_bo;
+ return static_cast<const T*>(p);
+ } else {
+ return nullptr;
+ }
+}
+
+int
+GEMBufferTracker::
+pread_buffer(void *dst, uint64_t gpu_address, uint64_t size) const
+{
+ GPUAddressQuery q;
+ q = get_memory_at_gpu_address(gpu_address);
+
+   if (q.m_gem_bo
+       && q.m_gem_bo->gpu_address_begin() <= gpu_address
+       && gpu_address + size <= q.m_gem_bo->gpu_address_end()) {
+      return q.m_gem_bo->pread_buffer(dst, q.m_offset_into_gem_bo, size);
+ } else {
+ return -1;
+ }
+}
+
+
+BatchbufferLog*
+GEMBufferTracker::
+fetch(uint32_t gem_handle)
+{
+ std::map<uint32_t, BatchbufferLog>::iterator iter;
+ iter = m_logs.find(gem_handle);
+ return (iter != m_logs.end()) ?
+ &iter->second:
+ nullptr;
+}
+
+BatchbufferLog*
+GEMBufferTracker::
+fetch_or_create(const void *bb, uint32_t h)
+{
+ BatchbufferLog *b;
+ b = fetch(h);
+
+ if (b == nullptr) {
+ std::map<uint32_t, BatchbufferLog>::iterator iter;
+ BatchbufferLog m(m_fd, bb, h);
+
+ iter = m_logs.insert(std::make_pair(h, m)).first;
+ b = &iter->second;
+ }
+
+ return b;
+}
+
+void
+GEMBufferTracker::
+remove_batchbuffer_log(const BatchbufferLog *q)
+{
+ assert(q != nullptr);
+ assert(q == fetch(q->src()->gem_bo));
+ m_logs.erase(q->src()->gem_bo);
+}
+
+///////////////////////////////
+// BatchbufferLogger methods
+BatchbufferLogger::
+BatchbufferLogger(void):
+ m_start_log_call_number(read_from_environment<unsigned int>("i965_INSTR_START", 0)),
+ m_end_log_call_number(read_from_environment<unsigned int>("i965_INSTR_END", ~0u)),
+ m_max_file_size(read_from_environment<unsigned int>("i965_INSTR_FILE_SIZE", 1 << 28)),
+ m_batchbuffer_state(default_batchbuffer_state_fcn),
+ m_active_batchbuffer(default_active_batchbuffer_fcn),
+ m_gen_spec(nullptr),
+ m_gen_disasm(nullptr),
+ m_dummy(-1, nullptr, ~0u),
+ m_number_aborted_batchbuffers(0)
+{
+ aborted_batchbuffer = aborted_batchbuffer_fcn;
+ release_driver = release_driver_fcn;
+
+ pre_call = pre_call_fcn;
+ post_call = post_call_fcn;
+ begin_logging = begin_logging_fcn;
+ end_logging = end_logging_fcn;
+ release_app = release_app_fcn;
+
+ std::string decode_level_str;
+ decode_level_str =
+ read_from_environment<std::string>("I965_DECODE_LEVEL",
+ "instruction_details_decode");
+
+ if (decode_level_str == "no_decode") {
+ m_decode_level = BatchbufferDecoder::no_decode;
+ } else if (decode_level_str == "instruction_decode") {
+ m_decode_level = BatchbufferDecoder::instruction_decode;
+ } else {
+ m_decode_level = BatchbufferDecoder::instruction_details_decode;
+ }
+
+ decode_level_str =
+ read_from_environment<std::string>("I965_PRINT_RELOC_LEVEL",
+ "print_reloc_nothing");
+ if (decode_level_str == "print_reloc_gem_gpu_updates") {
+ m_print_reloc_level = BatchbufferDecoder::print_reloc_gem_gpu_updates;
+ } else {
+ m_print_reloc_level = BatchbufferDecoder::print_reloc_nothing;
+ }
+
+ m_shader_decode_flags = 0u;
+ if (read_from_environment<int>("I965_DECODE_VS", 1)) {
+ m_shader_decode_flags |= (1u << BatchbufferDecoder::shader_decode_vs);
+ }
+ if (read_from_environment<int>("I965_DECODE_HS", 1)) {
+ m_shader_decode_flags |= (1u << BatchbufferDecoder::shader_decode_hs);
+ }
+ if (read_from_environment<int>("I965_DECODE_DS", 1)) {
+ m_shader_decode_flags |= (1u << BatchbufferDecoder::shader_decode_ds);
+ }
+ if (read_from_environment<int>("I965_DECODE_GS", 1)) {
+ m_shader_decode_flags |= (1u << BatchbufferDecoder::shader_decode_gs);
+ }
+ if (read_from_environment<int>("I965_DECODE_PS8", 1)) {
+ m_shader_decode_flags |= (1u << BatchbufferDecoder::shader_decode_ps_8);
+ }
+ if (read_from_environment<int>("I965_DECODE_PS16", 1)) {
+ m_shader_decode_flags |= (1u << BatchbufferDecoder::shader_decode_ps_16);
+ }
+ if (read_from_environment<int>("I965_DECODE_PS32", 1)) {
+ m_shader_decode_flags |= (1u << BatchbufferDecoder::shader_decode_ps_32);
+ }
+ if (read_from_environment<int>("I965_DECODE_CS", 1)) {
+ m_shader_decode_flags |=
+ (1u << BatchbufferDecoder::shader_decode_media_compute);
+ }
+}
+
+BatchbufferLogger::
+~BatchbufferLogger()
+{
+ for (const auto &v : m_gem_buffer_trackers) {
+ v.second->emit_unemitted_log(m_file);
+ delete v.second;
+ }
+
+ if (!m_dummy.empty() && m_file) {
+ m_file.begin_block("Logs not associated to batchbuffer");
+ m_dummy.emit_log(nullptr, m_file, 0);
+ m_file.end_block();
+ }
+
+ if (m_gen_disasm) {
+ gen_disasm_destroy(m_gen_disasm);
+ }
+}
+
+void
+BatchbufferLogger::
+aborted_batchbuffer_fcn(struct i965_batchbuffer_logger *pthis,
+ int fd, uint32_t gem_bo)
+{
+ BatchbufferLogger *R;
+ R = static_cast<BatchbufferLogger*>(pthis);
+
+ R->m_mutex.lock();
+
+ BatchbufferLog *bb;
+ bb = R->fetch_batchbuffer_log(fd, gem_bo);
+ if (bb) {
+ if(R->m_file && !bb->empty()) {
+ ++R->m_number_aborted_batchbuffers;
+ R->m_file.begin_block_value("Aborted batchbuffer", "#%d",
+ R->m_number_aborted_batchbuffers);
+ R->m_file.print_value("fd", "%d", bb->src()->fd);
+ R->m_file.print_value("gem_bo", "%u", bb->src()->gem_bo);
+ bb->emit_log(nullptr, R->m_file, 0);
+ R->m_file.end_block();
+ }
+ R->gem_buffer_tracker(bb->src()->fd)->remove_batchbuffer_log(bb);
+ }
+
+ R->m_mutex.unlock();
+}
+
+void
+BatchbufferLogger::
+release_driver_fcn(struct i965_batchbuffer_logger *pthis)
+{
+ release();
+}
+
+void
+BatchbufferLogger::
+pre_call_fcn(struct i965_batchbuffer_logger_app *pthis,
+ unsigned int call_id,
+ const char *call_detailed,
+ const char *fcn_name)
+{
+ BatchbufferLogger *R;
+ BatchbufferLog *bb;
+ uint32_t time_of_print(0);
+
+ R = static_cast<BatchbufferLogger*>(pthis);
+
+ R->m_mutex.lock();
+ bb = R->fetch_or_create_batchbuffer_log();
+ if (bb != &R->m_dummy) {
+ time_of_print = R->m_batchbuffer_state(bb->src());
+ }
+ bb->add_call_marker(R->m_file, R->m_dummy, call_id, fcn_name,
+ call_detailed, time_of_print);
+ R->m_mutex.unlock();
+}
+
+void
+BatchbufferLogger::
+post_call_fcn(struct i965_batchbuffer_logger_app *pthis,
+ unsigned int call_id)
+{
+}
+
+void
+BatchbufferLogger::
+begin_logging_fcn(struct i965_batchbuffer_logger_app *pthis,
+ const char *name)
+{
+ BatchbufferLogger *R;
+ R = static_cast<BatchbufferLogger*>(pthis);
+
+ R->m_mutex.lock();
+ R->m_file.open(name);
+ R->m_shader_filelist.clear();
+ R->m_mutex.unlock();
+}
+
+void
+BatchbufferLogger::
+end_logging_fcn(struct i965_batchbuffer_logger_app *pthis)
+{
+ BatchbufferLogger *R;
+ R = static_cast<BatchbufferLogger*>(pthis);
+
+ R->m_mutex.lock();
+ /* We need to emit all the data of batchbuffers with a log */
+ for(const auto &v: R->m_gem_buffer_trackers) {
+ v.second->emit_unemitted_log(R->m_file);
+ }
+ R->m_file.close();
+ R->m_mutex.unlock();
+}
+
+void
+BatchbufferLogger::
+release_app_fcn(struct i965_batchbuffer_logger_app *pthis)
+{
+ release();
+}
+
+GEMBufferTracker*
+BatchbufferLogger::
+gem_buffer_tracker(int fd)
+{
+ GEMBufferTracker *q;
+ std::map<int, GEMBufferTracker*>::iterator iter;
+
+ iter = m_gem_buffer_trackers.find(fd);
+ if (iter != m_gem_buffer_trackers.end()) {
+ q = iter->second;
+ } else {
+ q = new GEMBufferTracker(fd);
+ m_gem_buffer_trackers[fd] = q;
+ }
+
+ return q;
+}
+
+void
+BatchbufferLogger::
+pre_process_ioctl(int fd, unsigned long request, void *argp)
+{
+ m_mutex.lock();
+}
+
+void
+BatchbufferLogger::
+post_process_ioctl(int ioctl_return_code, int fd, unsigned long request,
+ void *argp)
+{
+ if (ioctl_return_code == -1) {
+ m_mutex.unlock();
+ return;
+ }
+
+ GEMBufferTracker *tracker;
+ BatchbufferLog *bb;
+ struct i965_logged_batchbuffer driver_bb;
+
+ tracker = gem_buffer_tracker(fd);
+ m_active_batchbuffer(&driver_bb);
+ if (driver_bb.fd == fd) {
+ bb = tracker->fetch_or_create(driver_bb.driver_data,
+ driver_bb.gem_bo);
+ } else {
+ bb = &m_dummy;
+ }
+
+ switch(request) {
+ case DRM_IOCTL_I915_GEM_CREATE: {
+ struct drm_i915_gem_create *create;
+
+ create = (struct drm_i915_gem_create*) argp;
+ tracker->add_gem_bo(*create);
+
+ std::ostringstream ostr;
+ ostr << "Create GEM BO fd = " << std::dec << fd
+ << ", size = " << create->size
+ << ", handle = " << create->handle;
+ bb->add_ioctl_log_entry(ostr.str());
+ break;
+ }
+
+ case DRM_IOCTL_I915_GEM_USERPTR: {
+ struct drm_i915_gem_userptr *create;
+
+ create = (struct drm_i915_gem_userptr*) argp;
+ tracker->add_gem_bo(*create);
+
+ std::ostringstream ostr;
+ ostr << "Create GEM BO-userptr fd = " << std::dec << fd
+ << ", user_size = " << create->user_size
+ << ", user_ptr = " << create->user_ptr
+ << ", handle = " << create->handle;
+ bb->add_ioctl_log_entry(ostr.str());
+ break;
+ }
+
+ case DRM_IOCTL_GEM_CLOSE: {
+ struct drm_gem_close *cmd;
+ std::ostringstream str;
+
+ cmd = (struct drm_gem_close *) argp;
+ tracker->remove_gem_bo(cmd->handle);
+
+ str << "Remove GEM BO fd = " << fd
+ << ", handle = " << cmd->handle;
+ bb->add_ioctl_log_entry(str.str());
+ break;
+ }
+
+ case DRM_IOCTL_I915_GEM_CONTEXT_CREATE: {
+ struct drm_i915_gem_context_create *create_hw_ctx;
+
+ create_hw_ctx = (struct drm_i915_gem_context_create*)argp;
+ tracker->add_hw_context(*create_hw_ctx);
+
+ std::ostringstream ostr;
+ ostr << "Create GEM HW context, fd = " << std::dec << fd
+ << ", handle = " << create_hw_ctx->ctx_id;
+ bb->add_ioctl_log_entry(ostr.str());
+ break;
+ }
+
+ case DRM_IOCTL_I915_GEM_CONTEXT_DESTROY: {
+ struct drm_i915_gem_context_destroy *destroy_hw_ctx;
+
+ destroy_hw_ctx = (struct drm_i915_gem_context_destroy*)argp;
+ tracker->remove_hw_context(*destroy_hw_ctx);
+
+ std::ostringstream ostr;
+ ostr << "Destroy GEM HW context, fd = " << std::dec << fd
+ << ", handle = " << destroy_hw_ctx->ctx_id;
+ bb->add_ioctl_log_entry(ostr.str());
+ break;
+ }
+
+ case DRM_IOCTL_I915_GEM_EXECBUFFER: {
+ //TODO:
+ break;
+ }
+
+ case DRM_IOCTL_I915_GEM_EXECBUFFER2:
+ case DRM_IOCTL_I915_GEM_EXECBUFFER2_WR: {
+ struct drm_i915_gem_execbuffer2 *execbuffer2 =
+ (struct drm_i915_gem_execbuffer2*) argp;
+ BatchbufferDecoder decoder(m_decode_level, m_print_reloc_level,
+ m_shader_decode_flags,
+ m_gen_spec, m_gen_disasm,
+ m_pci_id, tracker, &m_shader_filelist,
+ execbuffer2);
+
+ assert(decoder.batchbuffer_log());
+ decoder.emit_log(m_file);
+ tracker->remove_batchbuffer_log(decoder.batchbuffer_log());
+ break;
+ }
+
+ } //of switch(request)
+
+ m_mutex.unlock();
+}
+
+int
+BatchbufferLogger::
+local_drm_ioctl(int fd, unsigned long request, void *argp)
+{
+ int ret;
+
+ do {
+ ret = ioctl(fd, request, argp);
+ } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
+
+ return ret;
+}
+
+static pthread_mutex_t i965_batchbuffer_logger_acquire_mutex =
+ PTHREAD_MUTEX_INITIALIZER;
+static int i965_batchbuffer_logger_acquire_ref_count = 0;
+static BatchbufferLogger *i965_batchbuffer_logger_object = nullptr;
+
+BatchbufferLogger*
+BatchbufferLogger::
+acquire(void)
+{
+ pthread_mutex_lock(&i965_batchbuffer_logger_acquire_mutex);
+
+ if (!i965_batchbuffer_logger_object) {
+ i965_batchbuffer_logger_object = new BatchbufferLogger();
+ }
+ ++i965_batchbuffer_logger_acquire_ref_count;
+
+ pthread_mutex_unlock(&i965_batchbuffer_logger_acquire_mutex);
+
+ return i965_batchbuffer_logger_object;
+}
+
+void
+BatchbufferLogger::
+release(void)
+{
+ pthread_mutex_lock(&i965_batchbuffer_logger_acquire_mutex);
+
+ --i965_batchbuffer_logger_acquire_ref_count;
+ if (i965_batchbuffer_logger_acquire_ref_count == 0) {
+ delete i965_batchbuffer_logger_object;
+ i965_batchbuffer_logger_object = nullptr;
+ }
+
+ pthread_mutex_unlock(&i965_batchbuffer_logger_acquire_mutex);
+}
+
+void
+BatchbufferLogger::
+set_driver_funcs(int pci_id,
+ i965_logged_batchbuffer_state f1,
+ i965_active_batchbuffer f2)
+{
+ int old_pci_id;
+
+ m_mutex.lock();
+ old_pci_id = m_pci_id;
+ m_batchbuffer_state = f1;
+ m_active_batchbuffer = f2;
+ m_pci_id = pci_id;
+ gen_get_device_info(m_pci_id, &m_dev_info);
+ m_gen_spec = gen_spec_load(&m_dev_info);
+
+ if (m_gen_disasm && old_pci_id != m_pci_id) {
+ gen_disasm_destroy(m_gen_disasm);
+ m_gen_disasm = nullptr;
+ }
+
+ if (m_gen_disasm == nullptr) {
+ m_gen_disasm = gen_disasm_create(m_pci_id);
+ }
+
+ m_mutex.unlock();
+}
+
+/* Replacing ioctl, as the aubdump tool of IGT does, does not
+ * work with apitrace; some of the ioctls are picked up, but not
+ * all. This appears to happen only under apitrace (and its
+ * glretrace program). I have no idea why replacing ioctl does
+ * not work, but replacing drmIoctl does.
+ */
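+/* A sketch of how the interposer is expected to be loaded (the
+ * exact library path is an assumption, not part of this patch):
+ * preload the logger so that its drmIoctl overrides the one in
+ * libdrm, e.g.
+ *
+ *   LD_PRELOAD=/path/to/libi965_batchbuffer_logger.so glretrace app.trace
+ */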
+extern "C"
+int
+drmIoctl(int fd, unsigned long request, void *arg)
+{
+ int return_value;
+
+ pthread_mutex_lock(&i965_batchbuffer_logger_acquire_mutex);
+
+ if (i965_batchbuffer_logger_object) {
+ i965_batchbuffer_logger_object->pre_process_ioctl(fd, request, arg);
+ }
+
+ return_value = BatchbufferLogger::local_drm_ioctl(fd, request, arg);
+
+ if (i965_batchbuffer_logger_object) {
+ i965_batchbuffer_logger_object->post_process_ioctl(return_value, fd,
+ request, arg);
+ }
+
+ pthread_mutex_unlock(&i965_batchbuffer_logger_acquire_mutex);
+
+ return return_value;
+}
+
+//////////////////////////////////////////
+// exported symbols for application integration
+extern "C"
+struct i965_batchbuffer_logger_app*
+i965_batchbuffer_logger_app_acquire(void)
+{
+ BatchbufferLogger *R;
+ R = BatchbufferLogger::acquire();
+ return R;
+}
+
+///////////////////////////////////////////
+// exported symbols for 3D driver integration
+extern "C"
+struct i965_batchbuffer_logger*
+i965_batchbuffer_logger_acquire(int pci_id,
+ i965_logged_batchbuffer_state f1,
+ i965_active_batchbuffer f2)
+{
+ BatchbufferLogger *R;
+ R = BatchbufferLogger::acquire();
+ R->set_driver_funcs(pci_id, f1, f2);
+ return R;
+}
diff --git a/src/intel/tools/i965_batchbuffer_logger_instructions.h b/src/intel/tools/i965_batchbuffer_logger_instructions.h
new file mode 100644
index 0000000..ad3385d
--- /dev/null
+++ b/src/intel/tools/i965_batchbuffer_logger_instructions.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef I965_BATCHBUFFER_LOGGER_INSTRUCTIONS_H
+#define I965_BATCHBUFFER_LOGGER_INSTRUCTIONS_H
+
+#define STATE_BASE_ADDRESS 0x61010000
+
+#define MEDIA_INTERFACE_DESCRIPTOR_LOAD 0x70020000
+#define MEDIA_CURBE_LOAD 0x70010000
+#define MEDIA_VFE_STATE 0x70000000
+#define MEDIA_STATE_FLUSH 0x70040000
+
+#define _3DSTATE_PIPELINE_SELECT 0x61040000
+#define _3DSTATE_PIPELINE_SELECT_GM45 0x69040000
+
+#define _3DSTATE_INDEX_BUFFER 0x780a0000
+#define _3DSTATE_VERTEX_BUFFERS 0x78080000
+
+#define _3DSTATE_VF_INSTANCING 0x78490000
+
+#define _3DSTATE_VS 0x78100000
+#define _3DSTATE_GS 0x78110000
+#define _3DSTATE_HS 0x781b0000
+#define _3DSTATE_DS 0x781d0000
+#define _3DSTATE_PS 0x78200000
+
+#define _3D_STATE_CLIP 0x78120000
+
+#define _3DSTATE_CONSTANT_VS 0x78150000
+#define _3DSTATE_CONSTANT_GS 0x78160000
+#define _3DSTATE_CONSTANT_PS 0x78170000
+#define _3DSTATE_CONSTANT_HS 0x78190000
+#define _3DSTATE_CONSTANT_DS 0x781A0000
+
+#define _3DSTATE_BINDING_TABLE_POINTERS_VS 0x78260000
+#define _3DSTATE_BINDING_TABLE_POINTERS_HS 0x78270000
+#define _3DSTATE_BINDING_TABLE_POINTERS_DS 0x78280000
+#define _3DSTATE_BINDING_TABLE_POINTERS_GS 0x78290000
+#define _3DSTATE_BINDING_TABLE_POINTERS_PS 0x782a0000
+
+#define _3DSTATE_SAMPLER_STATE_POINTERS_VS 0x782b0000
+#define _3DSTATE_SAMPLER_STATE_POINTERS_DS 0x782c0000
+#define _3DSTATE_SAMPLER_STATE_POINTERS_HS 0x782d0000
+#define _3DSTATE_SAMPLER_STATE_POINTERS_GS 0x782e0000
+#define _3DSTATE_SAMPLER_STATE_POINTERS_PS 0x782f0000
+#define _3DSTATE_SAMPLER_STATE_POINTERS 0x78020000
+
+#define _3DSTATE_VIEWPORT_STATE_POINTERS_CC 0x78230000
+#define _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP 0x78210000
+#define _3DSTATE_BLEND_STATE_POINTERS 0x78240000
+#define _3DSTATE_CC_STATE_POINTERS 0x780e0000
+#define _3DSTATE_SCISSOR_STATE_POINTERS 0x780f0000
+
+#define _MI_CMD_3D (0x3 << 29)
+#define _3DSTATE_PIPE_CONTROL (_MI_CMD_3D | (3 << 27) | (2 << 24))
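+/* i.e. 0x7a000000: command type 3 in bits 31:29, sub-type 3 in
+ * bits 28:27, opcode 2 in bits 26:24.
+ */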
+
+#define _3DPRIMITIVE 0x7b000000
+#define _GPGPU_WALKER 0x71050000
+
+#define _MI_CMD (0x0 << 29)
+
+/* MI commands that set register values which we can (mostly)
+ * determine after the kernel returns from the ioctl.
+ */
+#define _MI_LOAD_REGISTER_IMM (_MI_CMD | (34 << 23))
+#define _MI_LOAD_REGISTER_REG (_MI_CMD | (42 << 23))
+#define _MI_LOAD_REGISTER_MEM (_MI_CMD | (41 << 23))
+#define _MI_STORE_REGISTER_MEM (_MI_CMD | (36 << 23))
+
+/* MI commands proper (as opposed to the register setters above);
+ * not all of these are allowed in an execlist.
+ */
+#define _MI_NOOP (_MI_CMD | ( 0 << 23))
+#define _MI_BATCH_BUFFER_END (_MI_CMD | (10 << 23))
+#define _MI_BATCH_BUFFER_START (_MI_CMD | (49 << 23))
+#define _MI_ARB_CHECK (_MI_CMD | ( 5 << 23))
+#define _MI_ATOMIC (_MI_CMD | (47 << 23))
+#define _MI_CLFLUSH (_MI_CMD | (39 << 23))
+#define _MI_CONDITIONAL_BATCH_BUFFER_END (_MI_CMD | (54 << 23))
+#define _MI_COPY_MEM_MEM (_MI_CMD | (46 << 23))
+#define _MI_DISPLAY_FLIP (_MI_CMD | (20 << 23))
+#define _MI_FORCE_WAKEUP (_MI_CMD | (29 << 23))
+#define _MI_LOAD_SCAN_LINES_EXCL (_MI_CMD | (19 << 23))
+#define _MI_LOAD_SCAN_LINES_INCL (_MI_CMD | (18 << 23))
+#define _MI_MATH (_MI_CMD | (26 << 23))
+#define _MI_REPORT_HEAD (_MI_CMD | ( 7 << 23))
+#define _MI_REPORT_PERF_COUNT (_MI_CMD | (40 << 23))
+#define _MI_RS_CONTEXT (_MI_CMD | (15 << 23))
+#define _MI_RS_CONTROL (_MI_CMD | ( 6 << 23))
+#define _MI_RS_STORE_DATA_IMM (_MI_CMD | (43 << 23))
+#define _MI_SEMAPHORE_SIGNAL (_MI_CMD | (27 << 23))
+#define _MI_SEMAPHORE_WAIT (_MI_CMD | (28 << 23))
+#define _MI_SET_CONTEXT (_MI_CMD | (24 << 23))
+#define _MI_STORE_DATA_IMM (_MI_CMD | (32 << 23))
+#define _MI_STORE_DATA_INDEX (_MI_CMD | (33 << 23))
+#define _MI_SUSPEND_FLUSH (_MI_CMD | (11 << 23))
+#define _MI_UPDATE_GTT (_MI_CMD | (35 << 23))
+#define _MI_USER_INTERRUPT (_MI_CMD | ( 2 << 23))
+#define _MI_WAIT_FOR_EVENT (_MI_CMD | ( 3 << 23))
+/* Setting the predicate, directly or via registers, is viewed
+ * as a command and not as state because the value to which it
+ * is set is not entirely determined by CPU-side values.
+ */
+#define _MI_SET_PREDICATE (_MI_CMD | ( 1 << 23))
+#define _MI_PREDICATE (_MI_CMD | (12 << 23))
+
+/* MI commands that set a state value */
+#define _MI_TOPOLOGY_FILTER (_MI_CMD | (13 << 23))
+
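+/* Illustrative sketch (a hypothetical helper, not used by the
+ * logger itself): an MI command carries its type in bits 31:29
+ * (zero for MI) and its opcode in bits 28:23 of the first dword,
+ * so a dword can be matched against the _MI_ values above by
+ * masking bits 31:23, e.g.
+ * mi_opcode_matches(dword, _MI_BATCH_BUFFER_END).
+ */
+static inline int
+mi_opcode_matches(unsigned dword, unsigned mi_op)
+{
+   return (dword & 0xff800000u) == mi_op;
+}
+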
+#endif
--
2.7.4