[Mesa-dev] [PATCH v2] radv: implement VK_EXT_sample_locations
Samuel Pitoiset
samuel.pitoiset at gmail.com
Thu May 16 09:55:02 UTC 2019
Basically, this extension allows applications to use custom
sample locations. It doesn't support variable sample locations
during subpass. Note that we don't have to upload the user
sample locations because the spec doesn't allow this.
Only enabled on VI+ because it's untested on older chips.
v2: - change sampleLocationCoordinateRange[1] to 0.9375
- compute and emit PA_SC_CENTROID_PRIORITY_{0,1}
- rebased on top of master
- some cleanups
Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
---
src/amd/vulkan/radv_cmd_buffer.c | 223 ++++++++++++++++++++++++++++++
src/amd/vulkan/radv_device.c | 27 ++++
src/amd/vulkan/radv_extensions.py | 1 +
src/amd/vulkan/radv_pipeline.c | 30 ++++
src/amd/vulkan/radv_private.h | 26 +++-
5 files changed, 300 insertions(+), 7 deletions(-)
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 4f592bc7f68..fb79c1c6713 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -105,6 +105,7 @@ radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer,
dest->viewport.count = src->viewport.count;
dest->scissor.count = src->scissor.count;
dest->discard_rectangle.count = src->discard_rectangle.count;
+ dest->sample_location.count = src->sample_location.count;
if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
@@ -192,6 +193,22 @@ radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer,
}
}
+ if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
+ if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
+ dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
+ dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
+ memcmp(&dest->sample_location.locations,
+ &src->sample_location.locations,
+ src->sample_location.count * sizeof(VkSampleLocationEXT))) {
+ dest->sample_location.per_pixel = src->sample_location.per_pixel;
+ dest->sample_location.grid_size = src->sample_location.grid_size;
+ typed_memcpy(dest->sample_location.locations,
+ src->sample_location.locations,
+ src->sample_location.count);
+ dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
+ }
+ }
+
cmd_buffer->state.dirty |= dest_mask;
}
@@ -632,6 +649,190 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer,
}
}
+/**
+ * Convert the user sample locations to hardware sample locations (the values
+ * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
+ */
+static void
+radv_convert_user_sample_locs(struct radv_sample_locations_state *state,
+ uint32_t x, uint32_t y, VkOffset2D *sample_locs)
+{
+ uint32_t x_offset = x % state->grid_size.width;
+ uint32_t y_offset = y % state->grid_size.height;
+ uint32_t num_samples = (uint32_t)state->per_pixel;
+ VkSampleLocationEXT *user_locs;
+ uint32_t pixel_offset;
+
+ pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;
+
+ assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
+ user_locs = &state->locations[pixel_offset];
+
+ for (uint32_t i = 0; i < num_samples; i++) {
+ float shifted_pos_x = user_locs[i].x - 0.5;
+ float shifted_pos_y = user_locs[i].y - 0.5;
+
+ int32_t scaled_pos_x = floor(shifted_pos_x * 16);
+ int32_t scaled_pos_y = floor(shifted_pos_y * 16);
+
+ sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
+ sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
+ }
+}
+
+/**
+ * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
+ * locations.
+ */
+static void
+radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
+ uint32_t *sample_locs_pixel)
+{
+ for (uint32_t i = 0; i < num_samples; i++) {
+ uint32_t sample_reg_idx = i / 4;
+ uint32_t sample_loc_idx = i % 4;
+ int32_t pos_x = sample_locs[i].x;
+ int32_t pos_y = sample_locs[i].y;
+
+ uint32_t shift_x = 8 * sample_loc_idx;
+ uint32_t shift_y = shift_x + 4;
+
+ sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
+ sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
+ }
+}
+
+/**
+ * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
+ * sample locations.
+ */
+static uint64_t
+radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer,
+ VkOffset2D *sample_locs,
+ uint32_t num_samples)
+{
+ uint32_t centroid_priorities[num_samples];
+ uint32_t sample_mask = num_samples - 1;
+ uint32_t distances[num_samples];
+ uint64_t centroid_priority = 0;
+
+ /* Compute the distances from center for each sample. */
+ for (int i = 0; i < num_samples; i++) {
+ distances[i] = (sample_locs[i].x * sample_locs[i].x) +
+ (sample_locs[i].y * sample_locs[i].y);
+ }
+
+ /* Compute the centroid priorities by looking at the distances array. */
+ for (int i = 0; i < num_samples; i++) {
+ uint32_t min_idx = 0;
+
+ for (int j = 1; j < num_samples; j++) {
+ if (distances[j] < distances[min_idx])
+ min_idx = j;
+ }
+
+ centroid_priorities[i] = min_idx;
+ distances[min_idx] = 0xffffffff;
+ }
+
+ /* Compute the final centroid priority. */
+ for (int i = 0; i < 8; i++) {
+ centroid_priority |=
+ centroid_priorities[i & sample_mask] << (i * 4);
+ }
+
+ return centroid_priority << 32 | centroid_priority;
+}
+
+/**
+ * Emit the sample locations that are specified with VK_EXT_sample_locations.
+ */
+static void
+radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
+{
+ struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+ struct radv_multisample_state *ms = &pipeline->graphics.ms;
+ struct radv_sample_locations_state *sample_location =
+ &cmd_buffer->state.dynamic.sample_location;
+ uint32_t num_samples = (uint32_t)sample_location->per_pixel;
+ struct radeon_cmdbuf *cs = cmd_buffer->cs;
+ uint32_t sample_locs_pixel[4][2] = {};
+ VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
+ uint32_t max_sample_dist = 0;
+ uint64_t centroid_priority;
+
+ if (!cmd_buffer->state.dynamic.sample_location.count)
+ return;
+
+ /* Convert the user sample locations to hardware sample locations. */
+ radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
+ radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
+ radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
+ radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);
+
+ /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
+ for (uint32_t i = 0; i < 4; i++) {
+ radv_compute_sample_locs_pixel(num_samples, sample_locs[i],
+ sample_locs_pixel[i]);
+ }
+
+ /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
+ centroid_priority =
+ radv_compute_centroid_priority(cmd_buffer, sample_locs[0],
+ num_samples);
+
+ /* Compute the maximum sample distance from the specified locations. */
+ for (uint32_t i = 0; i < num_samples; i++) {
+ VkOffset2D offset = sample_locs[0][i];
+ max_sample_dist = MAX2(max_sample_dist,
+ MAX2(abs(offset.x), abs(offset.y)));
+ }
+
+ /* Emit the specified user sample locations. */
+ switch (num_samples) {
+ case 2:
+ case 4:
+ radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
+ radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
+ radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
+ radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
+ break;
+ case 8:
+ radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
+ radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
+ radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
+ radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
+ radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1, sample_locs_pixel[0][1]);
+ radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1, sample_locs_pixel[1][1]);
+ radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1, sample_locs_pixel[2][1]);
+ radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1, sample_locs_pixel[3][1]);
+ break;
+ default:
+ unreachable("invalid number of samples");
+ }
+
+ /* Emit the maximum sample distance and the centroid priority. */
+ uint32_t pa_sc_aa_config = ms->pa_sc_aa_config;
+
+ pa_sc_aa_config &= C_028BE0_MAX_SAMPLE_DIST;
+ pa_sc_aa_config |= S_028BE0_MAX_SAMPLE_DIST(max_sample_dist);
+
+ radeon_set_context_reg_seq(cs, R_028BE0_PA_SC_AA_CONFIG, 1);
+ radeon_emit(cs, pa_sc_aa_config);
+
+ radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
+ radeon_emit(cs, centroid_priority);
+ radeon_emit(cs, centroid_priority >> 32);
+
+ /* GFX9: Flush DFSM when the AA mode changes. */
+ if (cmd_buffer->device->dfsm_allowed) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
+ }
+
+ cmd_buffer->state.context_roll_without_scissor_emitted = true;
+}
+
static void
radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer,
struct radv_pipeline *pipeline,
@@ -1775,6 +1976,9 @@ radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
radv_emit_discard_rectangle(cmd_buffer);
+ if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)
+ radv_emit_sample_locations(cmd_buffer);
+
cmd_buffer->state.dirty &= ~states;
}
@@ -3266,6 +3470,25 @@ void radv_CmdSetDiscardRectangleEXT(
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
}
+void radv_CmdSetSampleLocationsEXT(
+ VkCommandBuffer commandBuffer,
+ const VkSampleLocationsInfoEXT* pSampleLocationsInfo)
+{
+ RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct radv_cmd_state *state = &cmd_buffer->state;
+
+ assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
+
+ state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
+ state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
+ state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
+ typed_memcpy(&state->dynamic.sample_location.locations[0],
+ pSampleLocationsInfo->pSampleLocations,
+ pSampleLocationsInfo->sampleLocationsCount);
+
+ state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
+}
+
void radv_CmdExecuteCommands(
VkCommandBuffer commandBuffer,
uint32_t commandBufferCount,
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index c0e317a97e5..e523bba1223 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -1360,6 +1360,19 @@ void radv_GetPhysicalDeviceProperties2(
props->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = MAX_INLINE_UNIFORM_BLOCK_COUNT;
break;
}
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLE_LOCATIONS_PROPERTIES_EXT: {
+ VkPhysicalDeviceSampleLocationsPropertiesEXT *properties =
+ (VkPhysicalDeviceSampleLocationsPropertiesEXT *)ext;
+ properties->sampleLocationSampleCounts = VK_SAMPLE_COUNT_2_BIT |
+ VK_SAMPLE_COUNT_4_BIT |
+ VK_SAMPLE_COUNT_8_BIT;
+ properties->maxSampleLocationGridSize = (VkExtent2D){ 2 , 2 };
+ properties->sampleLocationCoordinateRange[0] = 0.0f;
+ properties->sampleLocationCoordinateRange[1] = 0.9375f;
+ properties->sampleLocationSubPixelBits = 4;
+ properties->variableSampleLocations = VK_FALSE;
+ break;
+ }
default:
break;
}
@@ -5361,3 +5374,17 @@ VkResult radv_GetCalibratedTimestampsEXT(
return VK_SUCCESS;
}
+
+void radv_GetPhysicalDeviceMultisamplePropertiesEXT(
+ VkPhysicalDevice physicalDevice,
+ VkSampleCountFlagBits samples,
+ VkMultisamplePropertiesEXT* pMultisampleProperties)
+{
+ if (samples & (VK_SAMPLE_COUNT_2_BIT |
+ VK_SAMPLE_COUNT_4_BIT |
+ VK_SAMPLE_COUNT_8_BIT)) {
+ pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){ 2, 2 };
+ } else {
+ pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){ 0, 0 };
+ }
+}
diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py
index 0b5af56a435..2c2c4dc74d2 100644
--- a/src/amd/vulkan/radv_extensions.py
+++ b/src/amd/vulkan/radv_extensions.py
@@ -119,6 +119,7 @@ EXTENSIONS = [
Extension('VK_EXT_memory_priority', 1, True),
Extension('VK_EXT_pci_bus_info', 2, True),
Extension('VK_EXT_pipeline_creation_feedback', 1, True),
+ Extension('VK_EXT_sample_locations', 1, 'device->rad_info.chip_class >= GFX8'),
Extension('VK_EXT_sampler_filter_minmax', 1, 'device->rad_info.chip_class >= GFX7'),
Extension('VK_EXT_scalar_block_layout', 1, 'device->rad_info.chip_class >= GFX7'),
Extension('VK_EXT_shader_viewport_index_layer', 1, True),
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 56fd65bec29..7f8e00b5c3c 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1267,6 +1267,8 @@ static unsigned radv_dynamic_state_mask(VkDynamicState state)
return RADV_DYNAMIC_STENCIL_REFERENCE;
case VK_DYNAMIC_STATE_DISCARD_RECTANGLE_EXT:
return RADV_DYNAMIC_DISCARD_RECTANGLE;
+ case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
+ return RADV_DYNAMIC_SAMPLE_LOCATIONS;
default:
unreachable("Unhandled dynamic state");
}
@@ -1297,6 +1299,11 @@ static uint32_t radv_pipeline_needed_dynamic_state(const VkGraphicsPipelineCreat
if (!vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT))
states &= ~RADV_DYNAMIC_DISCARD_RECTANGLE;
+ if (!pCreateInfo->pMultisampleState ||
+ !vk_find_struct_const(pCreateInfo->pMultisampleState->pNext,
+ PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT))
+ states &= ~RADV_DYNAMIC_SAMPLE_LOCATIONS;
+
/* TODO: blend constants & line width. */
return states;
@@ -1426,6 +1433,29 @@ radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline,
}
}
+ if (needed_states & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
+ const VkPipelineSampleLocationsStateCreateInfoEXT *sample_location_info =
+ vk_find_struct_const(pCreateInfo->pMultisampleState->pNext,
+ PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
+ /* If sampleLocationsEnable is VK_FALSE, the default sample
+ * locations are used and the values specified in
+ * sampleLocationsInfo are ignored.
+ */
+ if (sample_location_info->sampleLocationsEnable) {
+ const VkSampleLocationsInfoEXT *pSampleLocationsInfo =
+ &sample_location_info->sampleLocationsInfo;
+
+ assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
+
+ dynamic->sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
+ dynamic->sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
+ dynamic->sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
+ typed_memcpy(&dynamic->sample_location.locations[0],
+ pSampleLocationsInfo->pSampleLocations,
+ pSampleLocationsInfo->sampleLocationsCount);
+ }
+ }
+
pipeline->dynamic_state.mask = states;
}
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 7834a505562..5c84bf74352 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -91,6 +91,7 @@ typedef uint32_t xcb_window_t;
#define MAX_VIEWPORTS 16
#define MAX_SCISSORS 16
#define MAX_DISCARD_RECTANGLES 4
+#define MAX_SAMPLE_LOCATIONS 32
#define MAX_PUSH_CONSTANTS_SIZE 128
#define MAX_PUSH_DESCRIPTORS 32
#define MAX_DYNAMIC_UNIFORM_BUFFERS 16
@@ -852,7 +853,8 @@ enum radv_dynamic_state_bits {
RADV_DYNAMIC_STENCIL_WRITE_MASK = 1 << 7,
RADV_DYNAMIC_STENCIL_REFERENCE = 1 << 8,
RADV_DYNAMIC_DISCARD_RECTANGLE = 1 << 9,
- RADV_DYNAMIC_ALL = (1 << 10) - 1,
+ RADV_DYNAMIC_SAMPLE_LOCATIONS = 1 << 10,
+ RADV_DYNAMIC_ALL = (1 << 11) - 1,
};
enum radv_cmd_dirty_bits {
@@ -868,12 +870,13 @@ enum radv_cmd_dirty_bits {
RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK = 1 << 7,
RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE = 1 << 8,
RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE = 1 << 9,
- RADV_CMD_DIRTY_DYNAMIC_ALL = (1 << 10) - 1,
- RADV_CMD_DIRTY_PIPELINE = 1 << 10,
- RADV_CMD_DIRTY_INDEX_BUFFER = 1 << 11,
- RADV_CMD_DIRTY_FRAMEBUFFER = 1 << 12,
- RADV_CMD_DIRTY_VERTEX_BUFFER = 1 << 13,
- RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1 << 14,
+ RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS = 1 << 10,
+ RADV_CMD_DIRTY_DYNAMIC_ALL = (1 << 11) - 1,
+ RADV_CMD_DIRTY_PIPELINE = 1 << 11,
+ RADV_CMD_DIRTY_INDEX_BUFFER = 1 << 12,
+ RADV_CMD_DIRTY_FRAMEBUFFER = 1 << 13,
+ RADV_CMD_DIRTY_VERTEX_BUFFER = 1 << 14,
+ RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1 << 15,
};
enum radv_cmd_flush_bits {
@@ -950,6 +953,13 @@ struct radv_discard_rectangle_state {
VkRect2D rectangles[MAX_DISCARD_RECTANGLES];
};
+struct radv_sample_locations_state {
+ VkSampleCountFlagBits per_pixel;
+ VkExtent2D grid_size;
+ uint32_t count;
+ VkSampleLocationEXT locations[MAX_SAMPLE_LOCATIONS];
+};
+
struct radv_dynamic_state {
/**
* Bitmask of (1 << VK_DYNAMIC_STATE_*).
@@ -992,6 +1002,8 @@ struct radv_dynamic_state {
} stencil_reference;
struct radv_discard_rectangle_state discard_rectangle;
+
+ struct radv_sample_locations_state sample_location;
};
extern const struct radv_dynamic_state default_dynamic_state;
--
2.21.0
More information about the mesa-dev
mailing list