[Mesa-dev] [PATCH] anv: implement pipeline statistics queries

Jason Ekstrand jason at jlekstrand.net
Wed Feb 22 05:07:25 UTC 2017


Hey Look.  I'm actually reading your patch now!

I read through the whole thing and over-all I think it looks fairly good.

On Sun, Nov 27, 2016 at 11:23 AM, Ilia Mirkin <imirkin at alum.mit.edu> wrote:

> The strategy is to just keep n anv_query_pool_slot entries per query
> instead of one. The available bit is only valid in the last one.
>

Seems like a reasonable approach.  To be honest, I'm not a huge fan of the
"available" bit (or 64 bits as the case may be) but I'm not sure how we'd
get away without it.

Maybe it would be better to do something like:

struct anv_query_entry {
   uint64_t begin;
   uint64_t end;
};

struct anv_query_pool_slot {
   uint64_t available;
   struct anv_query_entry entries[0];
};

Food for thought.


> Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
> ---
>
> I think this is in a pretty good state now. I've tested both the direct and
> buffer paths with a hacked up cube application, and I'm seeing
> non-ridiculous
> values for the various counters, although I haven't 100% verified them for
> accuracy.
>
> This also implements the hsw/bdw workaround for dividing frag invocations
> by 4,
> copied from hsw_queryobj. I tested this on SKL and it seems to divide the
> values
> as expected.
>
> The cube patch I've been testing with is at http://paste.debian.net/899374
> /
> You can flip between copying to a buffer and explicit retrieval by
> commenting
> out the relevant function calls.
>
>  src/intel/vulkan/anv_device.c      |   2 +-
>  src/intel/vulkan/anv_private.h     |   4 +
>  src/intel/vulkan/anv_query.c       |  99 ++++++++++----
>  src/intel/vulkan/genX_cmd_buffer.c | 260 ++++++++++++++++++++++++++++++
> ++-----
>  4 files changed, 308 insertions(+), 57 deletions(-)
>
> diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
> index 99eb73c..7ad1970 100644
> --- a/src/intel/vulkan/anv_device.c
> +++ b/src/intel/vulkan/anv_device.c
> @@ -427,7 +427,7 @@ void anv_GetPhysicalDeviceFeatures(
>        .textureCompressionASTC_LDR               = pdevice->info.gen >= 9,
> /* FINISHME CHV */
>        .textureCompressionBC                     = true,
>        .occlusionQueryPrecise                    = true,
> -      .pipelineStatisticsQuery                  = false,
> +      .pipelineStatisticsQuery                  = true,
>        .fragmentStoresAndAtomics                 = true,
>        .shaderTessellationAndGeometryPointSize   = true,
>        .shaderImageGatherExtended                = false,
> diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private
> .h
> index 2fc543d..7271609 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -1763,6 +1763,8 @@ struct anv_render_pass {
>     struct anv_subpass                           subpasses[0];
>  };
>
> +#define ANV_PIPELINE_STATISTICS_COUNT 11
> +
>  struct anv_query_pool_slot {
>     uint64_t begin;
>     uint64_t end;
> @@ -1772,6 +1774,8 @@ struct anv_query_pool_slot {
>  struct anv_query_pool {
>     VkQueryType                                  type;
>     uint32_t                                     slots;
> +   uint32_t                                     pipeline_statistics;
> +   uint32_t                                     slot_stride;
>     struct anv_bo                                bo;
>  };
>
> diff --git a/src/intel/vulkan/anv_query.c b/src/intel/vulkan/anv_query.c
> index 293257b..dc00859 100644
> --- a/src/intel/vulkan/anv_query.c
> +++ b/src/intel/vulkan/anv_query.c
> @@ -38,8 +38,10 @@ VkResult anv_CreateQueryPool(
>     ANV_FROM_HANDLE(anv_device, device, _device);
>     struct anv_query_pool *pool;
>     VkResult result;
> -   uint32_t slot_size;
> -   uint64_t size;
> +   uint32_t slot_size = sizeof(struct anv_query_pool_slot);
> +   uint32_t slot_stride = 1;
>

Strides are usually in bytes, not slots...


> +   uint64_t size = pCreateInfo->queryCount * slot_size;
>

Might make sense to move this to after we compute the slot_stride.


> +   uint32_t pipeline_statistics = 0;
>
>     assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_C
> REATE_INFO);
>
> @@ -48,12 +50,16 @@ VkResult anv_CreateQueryPool(
>     case VK_QUERY_TYPE_TIMESTAMP:
>        break;
>     case VK_QUERY_TYPE_PIPELINE_STATISTICS:
> -      return VK_ERROR_INCOMPATIBLE_DRIVER;
> +      pipeline_statistics = pCreateInfo->pipelineStatistics &
> +         ((1 << ANV_PIPELINE_STATISTICS_COUNT) - 1);
> +      slot_stride = _mesa_bitcount(pipeline_statistics);
> +      size *= slot_stride;
> +      break;
>     default:
>        assert(!"Invalid query type");
> +      return VK_ERROR_INCOMPATIBLE_DRIVER;
>     }
>
> -   slot_size = sizeof(struct anv_query_pool_slot);
>     pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
>                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
>     if (pool == NULL)
> @@ -61,8 +67,9 @@ VkResult anv_CreateQueryPool(
>
>     pool->type = pCreateInfo->queryType;
>     pool->slots = pCreateInfo->queryCount;
> +   pool->pipeline_statistics = pipeline_statistics;
> +   pool->slot_stride = slot_stride;
>
> -   size = pCreateInfo->queryCount * slot_size;
>     result = anv_bo_init_new(&pool->bo, device, size);
>     if (result != VK_SUCCESS)
>        goto fail;
> @@ -95,6 +102,27 @@ void anv_DestroyQueryPool(
>     vk_free2(&device->alloc, pAllocator, pool);
>  }
>
> +static void *
> +store_query_result(void *pData, VkQueryResultFlags flags,
> +                   uint64_t result, uint64_t available)
> +{
> +   if (flags & VK_QUERY_RESULT_64_BIT) {
> +      uint64_t *dst = pData;
> +      *dst++ = result;
> +      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
> +         *dst++ = available;
> +      return dst;
> +   } else {
> +      uint32_t *dst = pData;
> +      if (result > UINT32_MAX)
> +         result = UINT32_MAX;
> +      *dst++ = result;
> +      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
> +         *dst++ = available;
> +      return dst;
> +   }
> +}
> +
>  VkResult anv_GetQueryPoolResults(
>      VkDevice                                    _device,
>      VkQueryPool                                 queryPool,
> @@ -112,6 +140,7 @@ VkResult anv_GetQueryPoolResults(
>     int ret;
>
>     assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
> +          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
>            pool->type == VK_QUERY_TYPE_TIMESTAMP);
>
>     if (pData == NULL)
> @@ -129,14 +158,42 @@ VkResult anv_GetQueryPoolResults(
>     void *data_end = pData + dataSize;
>     struct anv_query_pool_slot *slot = pool->bo.map;
>
> -   for (uint32_t i = 0; i < queryCount; i++) {
> +   for (uint32_t i = 0; i < queryCount && pData < data_end;
>

I think this condition is broken if dataSize is not a multiple of the query
slot stride.


> +        i++, pData += stride) {
> +      if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
> +         VkQueryResultFlags f = flags & ~VK_QUERY_RESULT_WITH_AVAILABI
> LITY_BIT;
> +         void *pos = pData;
> +         uint32_t pipeline_statistics = pool->pipeline_statistics;
> +         struct anv_query_pool_slot *base =
> +            &slot[(firstQuery + i) * pool->slot_stride];
> +
> +         while (pipeline_statistics) {
> +            uint32_t stat = u_bit_scan(&pipeline_statistics);
> +            uint64_t result = base->end - base->begin;
> +
> +            /* WaDividePSInvocationCountBy4:HSW,BDW */
> +            if ((device->info.gen == 8 || device->info.is_haswell) &&
> +                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FR
> AGMENT_SHADER_INVOCATIONS_BIT)
> +               result >>= 2;
> +
> +            pos = store_query_result(pos, f, result, 0);
> +            base++;
> +         }
> +         if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
> +            base--;
> +            if (flags & VK_QUERY_RESULT_64_BIT)
> +               *(uint64_t *)pos = base->available;
> +            else
> +               *(uint32_t *)pos = base->available;
>

Given what I'm reading here, I think my suggestion above about reworking
query_slot makes even more sense.


> +         }
> +         continue;
> +      }
> +
>        switch (pool->type) {
>        case VK_QUERY_TYPE_OCCLUSION: {
>           result = slot[firstQuery + i].end - slot[firstQuery + i].begin;
>           break;
>        }
> -      case VK_QUERY_TYPE_PIPELINE_STATISTICS:
> -         unreachable("pipeline stats not supported");
>        case VK_QUERY_TYPE_TIMESTAMP: {
>           result = slot[firstQuery + i].begin;
>           break;
> @@ -145,23 +202,7 @@ VkResult anv_GetQueryPoolResults(
>           unreachable("invalid pool type");
>        }
>
> -      if (flags & VK_QUERY_RESULT_64_BIT) {
> -         uint64_t *dst = pData;
> -         dst[0] = result;
> -         if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
> -            dst[1] = slot[firstQuery + i].available;
> -      } else {
> -         uint32_t *dst = pData;
> -         if (result > UINT32_MAX)
> -            result = UINT32_MAX;
> -         dst[0] = result;
> -         if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
> -            dst[1] = slot[firstQuery + i].available;
> -      }
> -
> -      pData += stride;
> -      if (pData >= data_end)
> -         break;
> +      store_query_result(pData, flags, result, slot[firstQuery +
> i].available);
>     }
>
>     return VK_SUCCESS;
> @@ -183,6 +224,14 @@ void anv_CmdResetQueryPool(
>           slot[firstQuery + i].available = 0;
>           break;
>        }
> +      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
> +         struct anv_query_pool_slot *slot = pool->bo.map;
> +
> +         slot = &slot[(firstQuery + i) * pool->slot_stride];
> +         for (uint32_t j = 0; j < pool->slot_stride; j++)
> +            slot[j].available = 0;
> +         break;
> +      }
>        default:
>           assert(!"Invalid query type");
>        }
> diff --git a/src/intel/vulkan/genX_cmd_buffer.c
> b/src/intel/vulkan/genX_cmd_buffer.c
> index a965cd6..1369ac2 100644
> --- a/src/intel/vulkan/genX_cmd_buffer.c
> +++ b/src/intel/vulkan/genX_cmd_buffer.c
> @@ -2272,6 +2272,50 @@ emit_query_availability(struct anv_cmd_buffer
> *cmd_buffer,
>     }
>  }
>
> +#define IA_VERTICES_COUNT               0x2310
> +#define IA_PRIMITIVES_COUNT             0x2318
> +#define VS_INVOCATION_COUNT             0x2320
> +#define HS_INVOCATION_COUNT             0x2300
> +#define DS_INVOCATION_COUNT             0x2308
> +#define GS_INVOCATION_COUNT             0x2328
> +#define GS_PRIMITIVES_COUNT             0x2330
> +#define CL_INVOCATION_COUNT             0x2338
> +#define CL_PRIMITIVES_COUNT             0x2340
> +#define PS_INVOCATION_COUNT             0x2348
> +#define CS_INVOCATION_COUNT             0x2290
>

I think the "right" thing to do would be to add genxml for these.


> +
> +static const uint32_t PIPELINE_STAT_TO_REG[] = {
> +   IA_VERTICES_COUNT,
> +   IA_PRIMITIVES_COUNT,
> +   VS_INVOCATION_COUNT,
> +   GS_INVOCATION_COUNT,
> +   GS_PRIMITIVES_COUNT,
> +   CL_INVOCATION_COUNT,
> +   CL_PRIMITIVES_COUNT,
> +   PS_INVOCATION_COUNT,
> +   HS_INVOCATION_COUNT,
> +   DS_INVOCATION_COUNT,
> +   CS_INVOCATION_COUNT
> +};
> +
> +static void
> +emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
> +                   struct anv_bo *bo, uint32_t offset) {
> +   STATIC_ASSERT(ARRAY_SIZE(PIPELINE_STAT_TO_REG) ==
> +                 ANV_PIPELINE_STATISTICS_COUNT);
> +
> +   uint32_t reg = PIPELINE_STAT_TO_REG[stat];
> +
> +   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
> +      lrm.RegisterAddress  = reg,
> +      lrm.MemoryAddress    = (struct anv_address) { bo, offset };
> +   }
> +   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
> +      lrm.RegisterAddress  = reg + 4,
> +      lrm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
> +   }
> +}
> +
>  void genX(CmdBeginQuery)(
>      VkCommandBuffer                             commandBuffer,
>      VkQueryPool                                 queryPool,
> @@ -2301,7 +2345,25 @@ void genX(CmdBeginQuery)(
>                            query * sizeof(struct anv_query_pool_slot));
>        break;
>
> -   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
> +   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
> +      uint32_t pipeline_statistics = pool->pipeline_statistics;
> +      uint32_t slot_offset = query * pool->slot_stride *
> +         sizeof(struct anv_query_pool_slot);
> +
> +      /* TODO: This might only be necessary for certain stats */
> +      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> +         pc.CommandStreamerStallEnable = true;
> +         pc.StallAtPixelScoreboard = true;
> +      }
> +
> +      while (pipeline_statistics) {
> +         uint32_t stat = u_bit_scan(&pipeline_statistics);
> +
> +         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, slot_offset);
> +         slot_offset += sizeof(struct anv_query_pool_slot);
> +      }
> +      break;
> +   }
>     default:
>        unreachable("");
>     }
> @@ -2314,17 +2376,35 @@ void genX(CmdEndQuery)(
>  {
>     ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
>     ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
> +   uint32_t slot_offset = query * pool->slot_stride *
> +      sizeof(struct anv_query_pool_slot);
>
>     switch (pool->type) {
>     case VK_QUERY_TYPE_OCCLUSION:
> -      emit_ps_depth_count(cmd_buffer, &pool->bo,
> -                          query * sizeof(struct anv_query_pool_slot) + 8);
> +      emit_ps_depth_count(cmd_buffer, &pool->bo, slot_offset + 8);
> +      emit_query_availability(cmd_buffer, &pool->bo, slot_offset + 16);
> +      break;
> +
> +   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
> +      uint32_t pipeline_statistics = pool->pipeline_statistics;
> +      /* TODO: This might only be necessary for certain stats */
> +      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> +         pc.CommandStreamerStallEnable = true;
> +         pc.StallAtPixelScoreboard = true;
> +      }
> +
> +      while (pipeline_statistics) {
> +         uint32_t stat = u_bit_scan(&pipeline_statistics);
>
> -      emit_query_availability(cmd_buffer, &pool->bo,
> -                              query * sizeof(struct anv_query_pool_slot)
> + 16);
> +         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, slot_offset +
> 8);
> +         slot_offset += sizeof(struct anv_query_pool_slot);
> +      }
> +
> +      slot_offset -= sizeof(struct anv_query_pool_slot);
> +      emit_query_availability(cmd_buffer, &pool->bo, slot_offset + 16);
>        break;
> +   }
>
> -   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
>     default:
>        unreachable("");
>     }
> @@ -2421,6 +2501,31 @@ emit_load_alu_reg_u64(struct anv_batch *batch,
> uint32_t reg,
>  }
>
>  static void
> +emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t
> imm)
> +{
> +   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
> +      lri.RegisterOffset   = reg;
> +      lri.DataDWord        = imm;
> +   }
> +}
> +
> +static void
> +emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t
> imm)
> +{
> +   emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm);
> +   emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32));
>

I don't think the casts are needed here.  They don't hurt though.


> +}
> +
> +static void
> +emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t
> dst)
> +{
> +   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
> +      lrr.SourceRegisterAddress      = src;
> +      lrr.DestinationRegisterAddress = dst;
> +   }
> +}
> +
> +static uint32_t
>  store_query_result(struct anv_batch *batch, uint32_t reg,
>                     struct anv_bo *bo, uint32_t offset, VkQueryResultFlags
> flags)
>  {
> @@ -2434,9 +2539,88 @@ store_query_result(struct anv_batch *batch,
> uint32_t reg,
>           srm.RegisterAddress  = reg + 4;
>           srm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
>        }
> +
> +      return offset + 8;
> +   }
> +
> +   return offset + 4;
> +}
> +
> +/*
> + * GPR0 = GPR0 & ((1ull << n) - 1);
> + */
> +static void
> +keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n)
> +{
> +   assert(n < 64);
> +   emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);
> +
> +   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
> +   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
> +   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1);
> +   dw[3] = alu(OPCODE_AND, 0, 0);
> +   dw[4] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU);
> +}
> +
> +/*
> + * GPR0 = GPR0 << 30;
> + */
> +static void
> +shl_gpr0_by_30_bits(struct anv_batch *batch)
> +{
> +   /* First we mask 34 bits of GPR0 to prevent overflow */
> +   keep_gpr0_lower_n_bits(batch, 34);
> +
> +   const uint32_t outer_count = 5;
> +   const uint32_t inner_count = 6;
> +   STATIC_ASSERT(outer_count * inner_count == 30);
> +   const uint32_t cmd_len = 1 + inner_count * 4;
> +
> +   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a
> total of
> +    * 30 left shifts.
>

Why do we need 5 MI_MATH commands?  Can't we do it in 1?


> +    */
> +   for (int o = 0; o < outer_count; o++) {
> +      /* Submit one MI_MATH to shift left by 6 bits */
> +      uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
> +      dw++;
> +      for (int i = 0; i < inner_count; i++, dw += 4) {
> +         dw[0] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
> +         dw[1] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
> +         dw[2] = alu(OPCODE_ADD, 0, 0);
> +         dw[3] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU);
> +      }
>     }
>  }
>
> +/*
> + * GPR0 = GPR0 >> 2;
> + *
> + * Note that the upper 30 bits of GPR are lost!
> + */
> +static void
> +shr_gpr0_by_2_bits(struct anv_batch *batch)
> +{
> +   shl_gpr0_by_30_bits(batch);
> +   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
> +   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
> +}
> +
> +static void
> +compute_query_result(struct anv_batch *batch, struct anv_bo *bo,
> +                     uint32_t dst_reg, uint32_t offset)
> +{
> +   emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset);
> +   emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8);
> +
> +   /* FIXME: We need to clamp the result for 32 bit. */
> +
> +   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
> +   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
> +   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
> +   dw[3] = alu(OPCODE_SUB, 0, 0);
> +   dw[4] = alu(OPCODE_STORE, dst_reg, OPERAND_ACCU);
> +}
>

Ugh...  So, I've been thinking about this a bit and I'm actually starting
to wonder if we don't want to take a completely different approach to
CmdCopyQueryPoolResults.  Namely, to use a vertex shader and
trandform-feedback to pull query buffer data in one end and dump computed
query results out the other.  For small quantities of queries, the command
streamer math may be faster, but if they're pulling a lot of queries, a VS
may be more efficient.  Also, it's way more flexible in terms of the math
it allows you to do.  Talos pulls queries in blocks of 256.  If you were to
try and pull that many pipeline statistics queries, the amount of batch
space it would burn is insane.

I'm happy to be the one to play with this.  I'm also reasonably happy to
land all the CS math and then clean it up later with a VS if it turns out
to be the most practical path.  Thoughts?


> +
>  void genX(CmdCopyQueryPoolResults)(
>      VkCommandBuffer                             commandBuffer,
>      VkQueryPool                                 queryPool,
> @@ -2459,50 +2643,64 @@ void genX(CmdCopyQueryPoolResults)(
>        }
>     }
>
> -   dst_offset = buffer->offset + destOffset;
>     for (uint32_t i = 0; i < queryCount; i++) {
> -
> +      dst_offset = buffer->offset + destOffset + destStride * i;
>        slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot);
>        switch (pool->type) {
>        case VK_QUERY_TYPE_OCCLUSION:
> -         emit_load_alu_reg_u64(&cmd_buffer->batch,
> -                               CS_GPR(0), &pool->bo, slot_offset);
> -         emit_load_alu_reg_u64(&cmd_buffer->batch,
> -                               CS_GPR(1), &pool->bo, slot_offset + 8);
> -
> -         /* FIXME: We need to clamp the result for 32 bit. */
> -
> -         uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5,
> GENX(MI_MATH));
> -         dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
> -         dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
> -         dw[3] = alu(OPCODE_SUB, 0, 0);
> -         dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
> +         compute_query_result(&cmd_buffer->batch, &pool->bo, OPERAND_R2,
> +                              slot_offset);
> +         dst_offset = store_query_result(
> +               &cmd_buffer->batch,
> +               CS_GPR(2), buffer->bo, dst_offset, flags);
>           break;
>
>        case VK_QUERY_TYPE_TIMESTAMP:
>           emit_load_alu_reg_u64(&cmd_buffer->batch,
>                                 CS_GPR(2), &pool->bo, slot_offset);
> +         dst_offset = store_query_result(
> +               &cmd_buffer->batch,
> +               CS_GPR(2), buffer->bo, dst_offset, flags);
>           break;
>
> +      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
> +         uint32_t pipeline_statistics = pool->pipeline_statistics;
> +
> +         slot_offset *= pool->slot_stride;
> +         while (pipeline_statistics) {
> +            uint32_t stat = u_bit_scan(&pipeline_statistics);
> +
> +            compute_query_result(&cmd_buffer->batch, &pool->bo,
> OPERAND_R0,
> +                                 slot_offset);
> +
> +            /* WaDividePSInvocationCountBy4:HSW,BDW */
> +            if ((cmd_buffer->device->info.gen == 8 ||
> +                 cmd_buffer->device->info.is_haswell) &&
> +                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
> {
> +               shr_gpr0_by_2_bits(&cmd_buffer->batch);
> +            }
> +            dst_offset = store_query_result(
> +                  &cmd_buffer->batch,
> +                  CS_GPR(0), buffer->bo, dst_offset, flags);
> +            slot_offset += sizeof(struct anv_query_pool_slot);
> +         }
> +
> +         /* Get the slot offset to where it's supposed to be for the
> +          * availability bit.
> +          */
> +         slot_offset -= sizeof(struct anv_query_pool_slot);
> +         break;
> +      }
>        default:
>           unreachable("unhandled query type");
>        }
>
> -      store_query_result(&cmd_buffer->batch,
> -                         CS_GPR(2), buffer->bo, dst_offset, flags);
> -
>        if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
>           emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
>                                 &pool->bo, slot_offset + 16);
> -         if (flags & VK_QUERY_RESULT_64_BIT)
> -            store_query_result(&cmd_buffer->batch,
> -                               CS_GPR(0), buffer->bo, dst_offset + 8,
> flags);
> -         else
> -            store_query_result(&cmd_buffer->batch,
> -                               CS_GPR(0), buffer->bo, dst_offset + 4,
> flags);
> +         store_query_result(&cmd_buffer->batch,
> +                            CS_GPR(0), buffer->bo, dst_offset, flags);
>        }
> -
> -      dst_offset += destStride;
>     }
>  }
>
> --
> 2.7.3
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20170221/662191e9/attachment-0001.html>


More information about the mesa-dev mailing list