[Mesa-dev] [PATCH v2] winsys/amdgpu: explicitly declare whether buffer_map is permanent or not
Marek Olšák
maraeo at gmail.com
Sat Nov 24 02:07:54 UTC 2018
Reviewed-by: Marek Olšák <marek.olsak at amd.com>
Marek
On Thu, Nov 22, 2018 at 6:32 AM Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> Introduce a new driver-private transfer flag RADEON_TRANSFER_TEMPORARY
> that specifies whether the caller will use buffer_unmap or not. The
> default behavior is set to permanent maps, because that's what drivers
> do for Gallium buffer maps.
>
> This should eliminate the need for hacks in libdrm. Assertions are added
> to catch when the buffer_unmap calls don't match the (temporary)
> buffer_map calls.
>
> I did my best to update r600 for consistency (r300 needs no changes
> because it never calls buffer_unmap), even though the radeon winsys
> ignores the new flag.
>
> As an added bonus, this should actually improve the performance of
> the normal fast path, because we no longer call into libdrm at all
> after the first map, and there's one fewer atomic in the winsys itself
> (there are now no atomics left in the UNSYNCHRONIZED fast path).
>
> Cc: Leo Liu <leo.liu at amd.com>
> v2:
> - remove comment about visible VRAM (Marek)
> - don't rely on amdgpu_bo_cpu_map doing an atomic write
> ---
> src/gallium/drivers/r600/evergreen_compute.c | 4 +-
> src/gallium/drivers/r600/r600_asm.c | 4 +-
> src/gallium/drivers/r600/r600_shader.c | 4 +-
> src/gallium/drivers/r600/radeon_uvd.c | 8 +-
> src/gallium/drivers/r600/radeon_vce.c | 4 +-
> src/gallium/drivers/r600/radeon_video.c | 6 +-
> src/gallium/drivers/radeon/radeon_uvd.c | 10 +-
> src/gallium/drivers/radeon/radeon_uvd_enc.c | 6 +-
> src/gallium/drivers/radeon/radeon_vce.c | 4 +-
> src/gallium/drivers/radeon/radeon_vcn_dec.c | 18 ++--
> src/gallium/drivers/radeon/radeon_vcn_enc.c | 4 +-
> src/gallium/drivers/radeon/radeon_video.c | 6 +-
> src/gallium/drivers/radeon/radeon_winsys.h | 14 ++-
> src/gallium/drivers/radeonsi/si_shader.c | 3 +-
> src/gallium/include/pipe/p_defines.h | 8 +-
> src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 96 +++++++++++++-------
> src/gallium/winsys/amdgpu/drm/amdgpu_bo.h | 3 +-
> 17 files changed, 140 insertions(+), 62 deletions(-)
>
> diff --git a/src/gallium/drivers/r600/evergreen_compute.c
> b/src/gallium/drivers/r600/evergreen_compute.c
> index a77f58242e3..9085be4e2f3 100644
> --- a/src/gallium/drivers/r600/evergreen_compute.c
> +++ b/src/gallium/drivers/r600/evergreen_compute.c
> @@ -431,21 +431,23 @@ static void *evergreen_create_compute_state(struct
> pipe_context *ctx,
> COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
> header = cso->prog;
> code = cso->prog + sizeof(struct pipe_llvm_program_header);
> radeon_shader_binary_init(&shader->binary);
> r600_elf_read(code, header->num_bytes, &shader->binary);
> r600_create_shader(&shader->bc, &shader->binary, &use_kill);
>
> /* Upload code + ROdata */
> shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
> shader->bc.ndw *
> 4);
> - p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo,
> PIPE_TRANSFER_WRITE);
> + p = r600_buffer_map_sync_with_rings(
> + &rctx->b, shader->code_bo,
> + PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
> //TODO: use util_memcpy_cpu_to_le32 ?
> memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
> rctx->b.ws->buffer_unmap(shader->code_bo->buf);
> #endif
>
> return shader;
> }
>
> static void evergreen_delete_compute_state(struct pipe_context *ctx, void
> *state)
> {
> diff --git a/src/gallium/drivers/r600/r600_asm.c
> b/src/gallium/drivers/r600/r600_asm.c
> index 7029be24f4b..4ba77c535f9 100644
> --- a/src/gallium/drivers/r600/r600_asm.c
> +++ b/src/gallium/drivers/r600/r600_asm.c
> @@ -2765,21 +2765,23 @@ void *r600_create_vertex_fetch_shader(struct
> pipe_context *ctx,
>
> u_suballocator_alloc(rctx->allocator_fetch_shader, fs_size, 256,
> &shader->offset,
> (struct pipe_resource**)&shader->buffer);
> if (!shader->buffer) {
> r600_bytecode_clear(&bc);
> FREE(shader);
> return NULL;
> }
>
> - bytecode = r600_buffer_map_sync_with_rings(&rctx->b,
> shader->buffer, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED);
> + bytecode = r600_buffer_map_sync_with_rings
> + (&rctx->b, shader->buffer,
> + PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED |
> RADEON_TRANSFER_TEMPORARY);
> bytecode += shader->offset / 4;
>
> if (R600_BIG_ENDIAN) {
> for (i = 0; i < fs_size / 4; ++i) {
> bytecode[i] = util_cpu_to_le32(bc.bytecode[i]);
> }
> } else {
> memcpy(bytecode, bc.bytecode, fs_size);
> }
> rctx->b.ws->buffer_unmap(shader->buffer->buf);
> diff --git a/src/gallium/drivers/r600/r600_shader.c
> b/src/gallium/drivers/r600/r600_shader.c
> index 408939d1105..fc826470d69 100644
> --- a/src/gallium/drivers/r600/r600_shader.c
> +++ b/src/gallium/drivers/r600/r600_shader.c
> @@ -134,21 +134,23 @@ static int store_shader(struct pipe_context *ctx,
> {
> struct r600_context *rctx = (struct r600_context *)ctx;
> uint32_t *ptr, i;
>
> if (shader->bo == NULL) {
> shader->bo = (struct r600_resource*)
> pipe_buffer_create(ctx->screen, 0,
> PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
> if (shader->bo == NULL) {
> return -ENOMEM;
> }
> - ptr = r600_buffer_map_sync_with_rings(&rctx->b,
> shader->bo, PIPE_TRANSFER_WRITE);
> + ptr = r600_buffer_map_sync_with_rings(
> + &rctx->b, shader->bo,
> + PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
> if (R600_BIG_ENDIAN) {
> for (i = 0; i < shader->shader.bc.ndw; ++i) {
> ptr[i] =
> util_cpu_to_le32(shader->shader.bc.bytecode[i]);
> }
> } else {
> memcpy(ptr, shader->shader.bc.bytecode,
> shader->shader.bc.ndw * sizeof(*ptr));
> }
> rctx->b.ws->buffer_unmap(shader->bo->buf);
> }
>
> diff --git a/src/gallium/drivers/r600/radeon_uvd.c
> b/src/gallium/drivers/r600/radeon_uvd.c
> index 495a93dc55a..5568f2138e4 100644
> --- a/src/gallium/drivers/r600/radeon_uvd.c
> +++ b/src/gallium/drivers/r600/radeon_uvd.c
> @@ -145,21 +145,22 @@ static bool have_it(struct ruvd_decoder *dec)
> /* map the next available message/feedback/itscaling buffer */
> static void map_msg_fb_it_buf(struct ruvd_decoder *dec)
> {
> struct rvid_buffer* buf;
> uint8_t *ptr;
>
> /* grab the current message/feedback buffer */
> buf = &dec->msg_fb_it_buffers[dec->cur_buffer];
>
> /* and map it for CPU access */
> - ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,
> PIPE_TRANSFER_WRITE);
> + ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,
> + PIPE_TRANSFER_WRITE |
> RADEON_TRANSFER_TEMPORARY);
>
> /* calc buffer offsets */
> dec->msg = (struct ruvd_msg *)ptr;
> memset(dec->msg, 0, sizeof(*dec->msg));
>
> dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET);
> if (have_it(dec))
> dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET +
> dec->fb_size);
> }
>
> @@ -1061,21 +1062,21 @@ static void ruvd_begin_frame(struct
> pipe_video_codec *decoder,
>
> assert(decoder);
>
> frame = ++dec->frame_number;
> vl_video_buffer_set_associated_data(target, decoder, (void *)frame,
> &ruvd_destroy_associated_data);
>
> dec->bs_size = 0;
> dec->bs_ptr = dec->ws->buffer_map(
> dec->bs_buffers[dec->cur_buffer].res->buf,
> - dec->cs, PIPE_TRANSFER_WRITE);
> + dec->cs, PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
> }
>
> /**
> * decode a macroblock
> */
> static void ruvd_decode_macroblock(struct pipe_video_codec *decoder,
> struct pipe_video_buffer *target,
> struct pipe_picture_desc *picture,
> const struct pipe_macroblock
> *macroblocks,
> unsigned num_macroblocks)
> @@ -1114,21 +1115,22 @@ static void ruvd_decode_bitstream(struct
> pipe_video_codec *decoder,
> new_size += 2; /* save for EOI */
>
> if (new_size > buf->res->buf->size) {
> dec->ws->buffer_unmap(buf->res->buf);
> if (!rvid_resize_buffer(dec->screen, dec->cs, buf,
> new_size)) {
> RVID_ERR("Can't resize bitstream buffer!");
> return;
> }
>
> dec->bs_ptr = dec->ws->buffer_map(buf->res->buf,
> dec->cs,
> -
> PIPE_TRANSFER_WRITE);
> +
> PIPE_TRANSFER_WRITE |
> +
> RADEON_TRANSFER_TEMPORARY);
> if (!dec->bs_ptr)
> return;
>
> dec->bs_ptr += dec->bs_size;
> }
>
> memcpy(dec->bs_ptr, buffers[i], sizes[i]);
> dec->bs_size += sizes[i];
> dec->bs_ptr += sizes[i];
> }
> diff --git a/src/gallium/drivers/r600/radeon_vce.c
> b/src/gallium/drivers/r600/radeon_vce.c
> index 60ba12a593a..e38b927b1d4 100644
> --- a/src/gallium/drivers/r600/radeon_vce.c
> +++ b/src/gallium/drivers/r600/radeon_vce.c
> @@ -346,21 +346,23 @@ static void rvce_end_frame(struct pipe_video_codec
> *encoder,
> }
> }
>
> static void rvce_get_feedback(struct pipe_video_codec *encoder,
> void *feedback, unsigned *size)
> {
> struct rvce_encoder *enc = (struct rvce_encoder*)encoder;
> struct rvid_buffer *fb = feedback;
>
> if (size) {
> - uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs,
> PIPE_TRANSFER_READ_WRITE);
> + uint32_t *ptr = enc->ws->buffer_map(
> + fb->res->buf, enc->cs,
> + PIPE_TRANSFER_READ_WRITE |
> RADEON_TRANSFER_TEMPORARY);
>
> if (ptr[1]) {
> *size = ptr[4] - ptr[9];
> } else {
> *size = 0;
> }
>
> enc->ws->buffer_unmap(fb->res->buf);
> }
> //dump_feedback(enc, fb);
> diff --git a/src/gallium/drivers/r600/radeon_video.c
> b/src/gallium/drivers/r600/radeon_video.c
> index 02fcf77d4ff..8e0af448be5 100644
> --- a/src/gallium/drivers/r600/radeon_video.c
> +++ b/src/gallium/drivers/r600/radeon_video.c
> @@ -90,25 +90,27 @@ bool rvid_resize_buffer(struct pipe_screen *screen,
> struct radeon_cmdbuf *cs,
> {
> struct r600_common_screen *rscreen = (struct r600_common_screen
> *)screen;
> struct radeon_winsys* ws = rscreen->ws;
> unsigned bytes = MIN2(new_buf->res->buf->size, new_size);
> struct rvid_buffer old_buf = *new_buf;
> void *src = NULL, *dst = NULL;
>
> if (!rvid_create_buffer(screen, new_buf, new_size, new_buf->usage))
> goto error;
>
> - src = ws->buffer_map(old_buf.res->buf, cs, PIPE_TRANSFER_READ);
> + src = ws->buffer_map(old_buf.res->buf, cs,
> + PIPE_TRANSFER_READ |
> RADEON_TRANSFER_TEMPORARY);
> if (!src)
> goto error;
>
> - dst = ws->buffer_map(new_buf->res->buf, cs, PIPE_TRANSFER_WRITE);
> + dst = ws->buffer_map(new_buf->res->buf, cs,
> + PIPE_TRANSFER_WRITE |
> RADEON_TRANSFER_TEMPORARY);
> if (!dst)
> goto error;
>
> memcpy(dst, src, bytes);
> if (new_size > bytes) {
> new_size -= bytes;
> dst += bytes;
> memset(dst, 0, new_size);
> }
> ws->buffer_unmap(new_buf->res->buf);
> diff --git a/src/gallium/drivers/radeon/radeon_uvd.c
> b/src/gallium/drivers/radeon/radeon_uvd.c
> index 62af1a311c2..ca066e89823 100644
> --- a/src/gallium/drivers/radeon/radeon_uvd.c
> +++ b/src/gallium/drivers/radeon/radeon_uvd.c
> @@ -141,21 +141,22 @@ static bool have_it(struct ruvd_decoder *dec)
> /* map the next available message/feedback/itscaling buffer */
> static void map_msg_fb_it_buf(struct ruvd_decoder *dec)
> {
> struct rvid_buffer* buf;
> uint8_t *ptr;
>
> /* grab the current message/feedback buffer */
> buf = &dec->msg_fb_it_buffers[dec->cur_buffer];
>
> /* and map it for CPU access */
> - ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,
> PIPE_TRANSFER_WRITE);
> + ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,
> + PIPE_TRANSFER_WRITE |
> RADEON_TRANSFER_TEMPORARY);
>
> /* calc buffer offsets */
> dec->msg = (struct ruvd_msg *)ptr;
> memset(dec->msg, 0, sizeof(*dec->msg));
>
> dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET);
> if (have_it(dec))
> dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET +
> dec->fb_size);
> }
>
> @@ -1008,21 +1009,21 @@ static void ruvd_begin_frame(struct
> pipe_video_codec *decoder,
>
> assert(decoder);
>
> frame = ++dec->frame_number;
> vl_video_buffer_set_associated_data(target, decoder, (void *)frame,
> &ruvd_destroy_associated_data);
>
> dec->bs_size = 0;
> dec->bs_ptr = dec->ws->buffer_map(
> dec->bs_buffers[dec->cur_buffer].res->buf,
> - dec->cs, PIPE_TRANSFER_WRITE);
> + dec->cs, PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
> }
>
> /**
> * decode a macroblock
> */
> static void ruvd_decode_macroblock(struct pipe_video_codec *decoder,
> struct pipe_video_buffer *target,
> struct pipe_picture_desc *picture,
> const struct pipe_macroblock
> *macroblocks,
> unsigned num_macroblocks)
> @@ -1053,22 +1054,23 @@ static void ruvd_decode_bitstream(struct
> pipe_video_codec *decoder,
> struct rvid_buffer *buf =
> &dec->bs_buffers[dec->cur_buffer];
> unsigned new_size = dec->bs_size + sizes[i];
>
> if (new_size > buf->res->buf->size) {
> dec->ws->buffer_unmap(buf->res->buf);
> if (!si_vid_resize_buffer(dec->screen, dec->cs,
> buf, new_size)) {
> RVID_ERR("Can't resize bitstream buffer!");
> return;
> }
>
> - dec->bs_ptr = dec->ws->buffer_map(buf->res->buf,
> dec->cs,
> -
> PIPE_TRANSFER_WRITE);
> + dec->bs_ptr = dec->ws->buffer_map(
> + buf->res->buf, dec->cs,
> + PIPE_TRANSFER_WRITE |
> RADEON_TRANSFER_TEMPORARY);
> if (!dec->bs_ptr)
> return;
>
> dec->bs_ptr += dec->bs_size;
> }
>
> memcpy(dec->bs_ptr, buffers[i], sizes[i]);
> dec->bs_size += sizes[i];
> dec->bs_ptr += sizes[i];
> }
> diff --git a/src/gallium/drivers/radeon/radeon_uvd_enc.c
> b/src/gallium/drivers/radeon/radeon_uvd_enc.c
> index 4384e5e1646..3164dbb2c20 100644
> --- a/src/gallium/drivers/radeon/radeon_uvd_enc.c
> +++ b/src/gallium/drivers/radeon/radeon_uvd_enc.c
> @@ -256,23 +256,23 @@ radeon_uvd_enc_destroy(struct pipe_video_codec
> *encoder)
>
> static void
> radeon_uvd_enc_get_feedback(struct pipe_video_codec *encoder,
> void *feedback, unsigned *size)
> {
> struct radeon_uvd_encoder *enc = (struct radeon_uvd_encoder *) encoder;
> struct rvid_buffer *fb = feedback;
>
> if (NULL != size) {
> radeon_uvd_enc_feedback_t *fb_data =
> - (radeon_uvd_enc_feedback_t *) enc->ws->buffer_map(fb->res->buf,
> - enc->cs,
> -
> PIPE_TRANSFER_READ_WRITE);
> + (radeon_uvd_enc_feedback_t *) enc->ws->buffer_map(
> + fb->res->buf, enc->cs,
> + PIPE_TRANSFER_READ_WRITE | RADEON_TRANSFER_TEMPORARY);
>
> if (!fb_data->status)
> *size = fb_data->bitstream_size;
> else
> *size = 0;
> enc->ws->buffer_unmap(fb->res->buf);
> }
>
> si_vid_destroy_buffer(fb);
> FREE(fb);
> diff --git a/src/gallium/drivers/radeon/radeon_vce.c
> b/src/gallium/drivers/radeon/radeon_vce.c
> index 310d1654b05..94df06e88c6 100644
> --- a/src/gallium/drivers/radeon/radeon_vce.c
> +++ b/src/gallium/drivers/radeon/radeon_vce.c
> @@ -345,21 +345,23 @@ static void rvce_end_frame(struct pipe_video_codec
> *encoder,
> }
> }
>
> static void rvce_get_feedback(struct pipe_video_codec *encoder,
> void *feedback, unsigned *size)
> {
> struct rvce_encoder *enc = (struct rvce_encoder*)encoder;
> struct rvid_buffer *fb = feedback;
>
> if (size) {
> - uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs,
> PIPE_TRANSFER_READ_WRITE);
> + uint32_t *ptr = enc->ws->buffer_map(
> + fb->res->buf, enc->cs,
> + PIPE_TRANSFER_READ_WRITE |
> RADEON_TRANSFER_TEMPORARY);
>
> if (ptr[1]) {
> *size = ptr[4] - ptr[9];
> } else {
> *size = 0;
> }
>
> enc->ws->buffer_unmap(fb->res->buf);
> }
> //dump_feedback(enc, fb);
> diff --git a/src/gallium/drivers/radeon/radeon_vcn_dec.c
> b/src/gallium/drivers/radeon/radeon_vcn_dec.c
> index 1ee85ae3d3f..e402af21a64 100644
> --- a/src/gallium/drivers/radeon/radeon_vcn_dec.c
> +++ b/src/gallium/drivers/radeon/radeon_vcn_dec.c
> @@ -934,21 +934,23 @@ static struct pb_buffer
> *rvcn_dec_message_decode(struct radeon_decoder *dec,
> ctx_size += 8 * 2 * 4096;
>
> if (dec->base.profile ==
> PIPE_VIDEO_PROFILE_VP9_PROFILE2)
> ctx_size += 8 * 2 * 4096;
>
> if (!si_vid_create_buffer(dec->screen, &dec->ctx,
> ctx_size, PIPE_USAGE_DEFAULT))
> RVID_ERR("Can't allocated context
> buffer.\n");
> si_vid_clear_buffer(dec->base.context, &dec->ctx);
>
> /* ctx needs probs table */
> - ptr = dec->ws->buffer_map(dec->ctx.res->buf,
> dec->cs, PIPE_TRANSFER_WRITE);
> + ptr = dec->ws->buffer_map(
> + dec->ctx.res->buf, dec->cs,
> + PIPE_TRANSFER_WRITE |
> RADEON_TRANSFER_TEMPORARY);
> fill_probs_table(ptr);
> dec->ws->buffer_unmap(dec->ctx.res->buf);
> }
> break;
> }
> default:
> assert(0);
> return NULL;
> }
>
> @@ -1027,21 +1029,22 @@ static bool have_probs(struct radeon_decoder *dec)
> /* map the next available message/feedback/itscaling buffer */
> static void map_msg_fb_it_probs_buf(struct radeon_decoder *dec)
> {
> struct rvid_buffer* buf;
> uint8_t *ptr;
>
> /* grab the current message/feedback buffer */
> buf = &dec->msg_fb_it_probs_buffers[dec->cur_buffer];
>
> /* and map it for CPU access */
> - ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,
> PIPE_TRANSFER_WRITE);
> + ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,
> + PIPE_TRANSFER_WRITE |
> RADEON_TRANSFER_TEMPORARY);
>
> /* calc buffer offsets */
> dec->msg = ptr;
>
> dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET);
> if (have_it(dec))
> dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET +
> FB_BUFFER_SIZE);
> else if (have_probs(dec))
> dec->probs = (uint8_t *)(ptr + FB_BUFFER_OFFSET +
> FB_BUFFER_SIZE);
> }
> @@ -1305,21 +1308,21 @@ static void radeon_dec_begin_frame(struct
> pipe_video_codec *decoder,
> assert(decoder);
>
> frame = ++dec->frame_number;
> if (dec->stream_type != RDECODE_CODEC_VP9)
> vl_video_buffer_set_associated_data(target, decoder, (void
> *)frame,
>
> &radeon_dec_destroy_associated_data);
>
> dec->bs_size = 0;
> dec->bs_ptr = dec->ws->buffer_map(
> dec->bs_buffers[dec->cur_buffer].res->buf,
> - dec->cs, PIPE_TRANSFER_WRITE);
> + dec->cs, PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
> }
>
> /**
> * decode a macroblock
> */
> static void radeon_dec_decode_macroblock(struct pipe_video_codec *decoder,
> struct pipe_video_buffer *target,
> struct pipe_picture_desc *picture,
> const struct pipe_macroblock
> *macroblocks,
> unsigned num_macroblocks)
> @@ -1350,22 +1353,23 @@ static void radeon_dec_decode_bitstream(struct
> pipe_video_codec *decoder,
> struct rvid_buffer *buf =
> &dec->bs_buffers[dec->cur_buffer];
> unsigned new_size = dec->bs_size + sizes[i];
>
> if (new_size > buf->res->buf->size) {
> dec->ws->buffer_unmap(buf->res->buf);
> if (!si_vid_resize_buffer(dec->screen, dec->cs,
> buf, new_size)) {
> RVID_ERR("Can't resize bitstream buffer!");
> return;
> }
>
> - dec->bs_ptr = dec->ws->buffer_map(buf->res->buf,
> dec->cs,
> -
> PIPE_TRANSFER_WRITE);
> + dec->bs_ptr = dec->ws->buffer_map(
> + buf->res->buf, dec->cs,
> + PIPE_TRANSFER_WRITE |
> RADEON_TRANSFER_TEMPORARY);
> if (!dec->bs_ptr)
> return;
>
> dec->bs_ptr += dec->bs_size;
> }
>
> memcpy(dec->bs_ptr, buffers[i], sizes[i]);
> dec->bs_size += sizes[i];
> dec->bs_ptr += sizes[i];
> }
> @@ -1536,21 +1540,23 @@ struct pipe_video_codec
> *radeon_create_decoder(struct pipe_context *context,
> }
>
> si_vid_clear_buffer(context,
> &dec->msg_fb_it_probs_buffers[i]);
> si_vid_clear_buffer(context, &dec->bs_buffers[i]);
>
> if (have_probs(dec)) {
> struct rvid_buffer* buf;
> void *ptr;
>
> buf = &dec->msg_fb_it_probs_buffers[i];
> - ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,
> PIPE_TRANSFER_WRITE);
> + ptr = dec->ws->buffer_map(
> + buf->res->buf, dec->cs,
> + PIPE_TRANSFER_WRITE |
> RADEON_TRANSFER_TEMPORARY);
> ptr += FB_BUFFER_OFFSET + FB_BUFFER_SIZE;
> fill_probs_table(ptr);
> dec->ws->buffer_unmap(buf->res->buf);
> }
> }
>
> dpb_size = calc_dpb_size(dec);
> if (dpb_size) {
> if (!si_vid_create_buffer(dec->screen, &dec->dpb,
> dpb_size, PIPE_USAGE_DEFAULT)) {
> RVID_ERR("Can't allocated dpb.\n");
> diff --git a/src/gallium/drivers/radeon/radeon_vcn_enc.c
> b/src/gallium/drivers/radeon/radeon_vcn_enc.c
> index e8676f6c721..7d64a28a405 100644
> --- a/src/gallium/drivers/radeon/radeon_vcn_enc.c
> +++ b/src/gallium/drivers/radeon/radeon_vcn_enc.c
> @@ -237,21 +237,23 @@ static void radeon_enc_destroy(struct
> pipe_video_codec *encoder)
> FREE(enc);
> }
>
> static void radeon_enc_get_feedback(struct pipe_video_codec *encoder,
> void *feedback,
> unsigned *size)
> {
> struct radeon_encoder *enc = (struct radeon_encoder*)encoder;
> struct rvid_buffer *fb = feedback;
>
> if (size) {
> - uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs,
> PIPE_TRANSFER_READ_WRITE);
> + uint32_t *ptr = enc->ws->buffer_map(
> + fb->res->buf, enc->cs,
> + PIPE_TRANSFER_READ_WRITE |
> RADEON_TRANSFER_TEMPORARY);
> if (ptr[1])
> *size = ptr[6];
> else
> *size = 0;
> enc->ws->buffer_unmap(fb->res->buf);
> }
>
> si_vid_destroy_buffer(fb);
> FREE(fb);
> }
> diff --git a/src/gallium/drivers/radeon/radeon_video.c
> b/src/gallium/drivers/radeon/radeon_video.c
> index a39ce4cc73e..bb1173e8005 100644
> --- a/src/gallium/drivers/radeon/radeon_video.c
> +++ b/src/gallium/drivers/radeon/radeon_video.c
> @@ -81,25 +81,27 @@ bool si_vid_resize_buffer(struct pipe_screen *screen,
> struct radeon_cmdbuf *cs,
> {
> struct si_screen *sscreen = (struct si_screen *)screen;
> struct radeon_winsys* ws = sscreen->ws;
> unsigned bytes = MIN2(new_buf->res->buf->size, new_size);
> struct rvid_buffer old_buf = *new_buf;
> void *src = NULL, *dst = NULL;
>
> if (!si_vid_create_buffer(screen, new_buf, new_size,
> new_buf->usage))
> goto error;
>
> - src = ws->buffer_map(old_buf.res->buf, cs, PIPE_TRANSFER_READ);
> + src = ws->buffer_map(old_buf.res->buf, cs,
> + PIPE_TRANSFER_READ |
> RADEON_TRANSFER_TEMPORARY);
> if (!src)
> goto error;
>
> - dst = ws->buffer_map(new_buf->res->buf, cs, PIPE_TRANSFER_WRITE);
> + dst = ws->buffer_map(new_buf->res->buf, cs,
> + PIPE_TRANSFER_WRITE |
> RADEON_TRANSFER_TEMPORARY);
> if (!dst)
> goto error;
>
> memcpy(dst, src, bytes);
> if (new_size > bytes) {
> new_size -= bytes;
> dst += bytes;
> memset(dst, 0, new_size);
> }
> ws->buffer_unmap(new_buf->res->buf);
> diff --git a/src/gallium/drivers/radeon/radeon_winsys.h
> b/src/gallium/drivers/radeon/radeon_winsys.h
> index 49f8bb279e5..a56ff75ad24 100644
> --- a/src/gallium/drivers/radeon/radeon_winsys.h
> +++ b/src/gallium/drivers/radeon/radeon_winsys.h
> @@ -69,20 +69,29 @@ enum radeon_bo_usage { /* bitfield */
> RADEON_USAGE_READ = 2,
> RADEON_USAGE_WRITE = 4,
> RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE,
>
> /* The winsys ensures that the CS submission will be scheduled after
> * previously flushed CSs referencing this BO in a conflicting way.
> */
> RADEON_USAGE_SYNCHRONIZED = 8
> };
>
> +enum radeon_transfer_flags {
> + /* Indicates that the caller will unmap the buffer.
> + *
> + * Not unmapping buffers is an important performance optimization for
> + * OpenGL (avoids kernel overhead for frequently mapped buffers).
> + */
> + RADEON_TRANSFER_TEMPORARY = (PIPE_TRANSFER_DRV_PRV << 0),
> +};
> +
> #define RADEON_SPARSE_PAGE_SIZE (64 * 1024)
>
> enum ring_type {
> RING_GFX = 0,
> RING_COMPUTE,
> RING_DMA,
> RING_UVD,
> RING_VCE,
> RING_UVD_ENC,
> RING_VCN_DEC,
> @@ -287,23 +296,26 @@ struct radeon_winsys {
> struct pb_buffer *(*buffer_create)(struct radeon_winsys *ws,
> uint64_t size,
> unsigned alignment,
> enum radeon_bo_domain domain,
> enum radeon_bo_flag flags);
>
> /**
> * Map the entire data store of a buffer object into the client's
> address
> * space.
> *
> + * Callers are expected to unmap buffers again if and only if the
> + * RADEON_TRANSFER_TEMPORARY flag is set in \p usage.
> + *
> * \param buf A winsys buffer object to map.
> * \param cs A command stream to flush if the buffer is
> referenced by it.
> - * \param usage A bitmask of the PIPE_TRANSFER_* flags.
> + * \param usage A bitmask of the PIPE_TRANSFER_* and
> RADEON_TRANSFER_* flags.
> * \return The pointer at the beginning of the buffer.
> */
> void *(*buffer_map)(struct pb_buffer *buf,
> struct radeon_cmdbuf *cs,
> enum pipe_transfer_usage usage);
>
> /**
> * Unmap a buffer object from the client's address space.
> *
> * \param buf A winsys buffer object to unmap.
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
> b/src/gallium/drivers/radeonsi/si_shader.c
> index 19522cc97b1..d455fb5db6a 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -5286,21 +5286,22 @@ int si_shader_binary_upload(struct si_screen
> *sscreen, struct si_shader *shader)
> 0 :
> SI_RESOURCE_FLAG_READ_ONLY,
> PIPE_USAGE_IMMUTABLE,
> align(bo_size,
> SI_CPDMA_ALIGNMENT),
> 256);
> if (!shader->bo)
> return -ENOMEM;
>
> /* Upload. */
> ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
> PIPE_TRANSFER_READ_WRITE |
> - PIPE_TRANSFER_UNSYNCHRONIZED);
> + PIPE_TRANSFER_UNSYNCHRONIZED |
> + RADEON_TRANSFER_TEMPORARY);
>
> /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
> * endian-independent. */
> if (prolog) {
> memcpy(ptr, prolog->code, prolog->code_size);
> ptr += prolog->code_size;
> }
> if (previous_stage) {
> memcpy(ptr, previous_stage->code,
> previous_stage->code_size);
> ptr += previous_stage->code_size;
> diff --git a/src/gallium/include/pipe/p_defines.h
> b/src/gallium/include/pipe/p_defines.h
> index 693f041b1da..e99895d30d8 100644
> --- a/src/gallium/include/pipe/p_defines.h
> +++ b/src/gallium/include/pipe/p_defines.h
> @@ -334,21 +334,27 @@ enum pipe_transfer_usage
> */
> PIPE_TRANSFER_PERSISTENT = (1 << 13),
>
> /**
> * If PERSISTENT is set, this ensures any writes done by the device are
> * immediately visible to the CPU and vice versa.
> *
> * PIPE_RESOURCE_FLAG_MAP_COHERENT must be set when creating
> * the resource.
> */
> - PIPE_TRANSFER_COHERENT = (1 << 14)
> + PIPE_TRANSFER_COHERENT = (1 << 14),
> +
> + /**
> + * This and higher bits are reserved for private use by drivers.
> Drivers
> + * should use this as (PIPE_TRANSFER_DRV_PRV << i).
> + */
> + PIPE_TRANSFER_DRV_PRV = (1 << 24)
> };
>
> /**
> * Flags for the flush function.
> */
> enum pipe_flush_flags
> {
> PIPE_FLUSH_END_OF_FRAME = (1 << 0),
> PIPE_FLUSH_DEFERRED = (1 << 1),
> PIPE_FLUSH_FENCE_FD = (1 << 2),
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> index 9f0d4c12482..99cd8cb31a5 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> @@ -49,20 +49,21 @@
> struct amdgpu_sparse_backing_chunk {
> uint32_t begin, end;
> };
>
> static struct pb_buffer *
> amdgpu_bo_create(struct radeon_winsys *rws,
> uint64_t size,
> unsigned alignment,
> enum radeon_bo_domain domain,
> enum radeon_bo_flag flags);
> +static void amdgpu_bo_unmap(struct pb_buffer *buf);
>
> static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
> enum radeon_bo_usage usage)
> {
> struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
> struct amdgpu_winsys *ws = bo->ws;
> int64_t abs_timeout;
>
> if (timeout == 0) {
> if (p_atomic_read(&bo->num_active_ioctls))
> @@ -166,20 +167,26 @@ static void amdgpu_bo_remove_fences(struct
> amdgpu_winsys_bo *bo)
> bo->max_fences = 0;
> }
>
> void amdgpu_bo_destroy(struct pb_buffer *_buf)
> {
> struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
> struct amdgpu_winsys *ws = bo->ws;
>
> assert(bo->bo && "must not be called for slab entries");
>
> + if (!bo->is_user_ptr && bo->cpu_ptr) {
> + bo->cpu_ptr = NULL;
> + amdgpu_bo_unmap(&bo->base);
> + }
> + assert(bo->is_user_ptr || bo->u.real.map_count == 0);
> +
> if (ws->debug_all_bos) {
> simple_mtx_lock(&ws->global_bo_list_lock);
> LIST_DEL(&bo->u.real.global_list_item);
> ws->num_buffers--;
> simple_mtx_unlock(&ws->global_bo_list_lock);
> }
>
> simple_mtx_lock(&ws->bo_export_table_lock);
> util_hash_table_remove(ws->bo_export_table, bo->bo);
> simple_mtx_unlock(&ws->bo_export_table_lock);
> @@ -188,54 +195,66 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf)
> amdgpu_va_range_free(bo->u.real.va_handle);
> amdgpu_bo_free(bo->bo);
>
> amdgpu_bo_remove_fences(bo);
>
> if (bo->initial_domain & RADEON_DOMAIN_VRAM)
> ws->allocated_vram -= align64(bo->base.size,
> ws->info.gart_page_size);
> else if (bo->initial_domain & RADEON_DOMAIN_GTT)
> ws->allocated_gtt -= align64(bo->base.size,
> ws->info.gart_page_size);
>
> - if (bo->u.real.map_count >= 1) {
> - if (bo->initial_domain & RADEON_DOMAIN_VRAM)
> - ws->mapped_vram -= bo->base.size;
> - else if (bo->initial_domain & RADEON_DOMAIN_GTT)
> - ws->mapped_gtt -= bo->base.size;
> - ws->num_mapped_buffers--;
> - }
> -
> simple_mtx_destroy(&bo->lock);
> FREE(bo);
> }
>
> static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf)
> {
> struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
>
> assert(bo->bo); /* slab buffers have a separate vtbl */
>
> if (bo->u.real.use_reusable_pool)
> pb_cache_add_buffer(&bo->u.real.cache_entry);
> else
> amdgpu_bo_destroy(_buf);
> }
>
> +static bool amdgpu_bo_do_map(struct amdgpu_winsys_bo *bo, void **cpu)
> +{
> + assert(!bo->sparse && bo->bo && !bo->is_user_ptr);
> + int r = amdgpu_bo_cpu_map(bo->bo, cpu);
> + if (r) {
> + /* Clear the cache and try again. */
> + pb_cache_release_all_buffers(&bo->ws->bo_cache);
> + r = amdgpu_bo_cpu_map(bo->bo, cpu);
> + if (r)
> + return false;
> + }
> +
> + if (p_atomic_inc_return(&bo->u.real.map_count) == 1) {
> + if (bo->initial_domain & RADEON_DOMAIN_VRAM)
> + bo->ws->mapped_vram += bo->base.size;
> + else if (bo->initial_domain & RADEON_DOMAIN_GTT)
> + bo->ws->mapped_gtt += bo->base.size;
> + bo->ws->num_mapped_buffers++;
> + }
> +
> + return true;
> +}
> +
> static void *amdgpu_bo_map(struct pb_buffer *buf,
> struct radeon_cmdbuf *rcs,
> enum pipe_transfer_usage usage)
> {
> struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
> struct amdgpu_winsys_bo *real;
> struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
> - int r;
> - void *cpu = NULL;
> - uint64_t offset = 0;
>
> assert(!bo->sparse);
>
> /* If it's not unsynchronized bo_map, flush CS if needed and then
> wait. */
> if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
> /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */
> if (usage & PIPE_TRANSFER_DONTBLOCK) {
> if (!(usage & PIPE_TRANSFER_WRITE)) {
> /* Mapping for read.
> *
> @@ -306,63 +325,75 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
> }
>
> amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
> RADEON_USAGE_READWRITE);
> }
>
> bo->ws->buffer_wait_time += os_time_get_nano() - time;
> }
> }
>
> - /* If the buffer is created from user memory, return the user pointer.
> */
> - if (bo->user_ptr)
> - return bo->user_ptr;
> + /* Buffer synchronization has been checked, now actually map the
> buffer. */
> + void *cpu = NULL;
> + uint64_t offset = 0;
>
> if (bo->bo) {
> real = bo;
> } else {
> real = bo->u.slab.real;
> offset = bo->va - real->va;
> }
>
> - r = amdgpu_bo_cpu_map(real->bo, &cpu);
> - if (r) {
> - /* Clear the cache and try again. */
> - pb_cache_release_all_buffers(&real->ws->bo_cache);
> - r = amdgpu_bo_cpu_map(real->bo, &cpu);
> - if (r)
> - return NULL;
> + if (usage & RADEON_TRANSFER_TEMPORARY) {
> + if (real->is_user_ptr) {
> + cpu = real->cpu_ptr;
> + } else {
> + if (!amdgpu_bo_do_map(real, &cpu))
> + return NULL;
> + }
> + } else {
> + cpu = p_atomic_read(&real->cpu_ptr);
> + if (!cpu) {
> + simple_mtx_lock(&real->lock);
> + /* Must re-check due to the possibility of a race. Re-check need
> not
> + * be atomic thanks to the lock. */
> + cpu = real->cpu_ptr;
> + if (!cpu) {
> + if (!amdgpu_bo_do_map(real, &cpu)) {
> + simple_mtx_unlock(&real->lock);
> + return NULL;
> + }
> + p_atomic_set(&real->cpu_ptr, cpu);
> + }
> + simple_mtx_unlock(&real->lock);
> + }
> }
>
> - if (p_atomic_inc_return(&real->u.real.map_count) == 1) {
> - if (real->initial_domain & RADEON_DOMAIN_VRAM)
> - real->ws->mapped_vram += real->base.size;
> - else if (real->initial_domain & RADEON_DOMAIN_GTT)
> - real->ws->mapped_gtt += real->base.size;
> - real->ws->num_mapped_buffers++;
> - }
> return (uint8_t*)cpu + offset;
> }
>
> static void amdgpu_bo_unmap(struct pb_buffer *buf)
> {
> struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
> struct amdgpu_winsys_bo *real;
>
> assert(!bo->sparse);
>
> - if (bo->user_ptr)
> + if (bo->is_user_ptr)
> return;
>
> real = bo->bo ? bo : bo->u.slab.real;
> -
> + assert(real->u.real.map_count != 0 && "too many unmaps");
> if (p_atomic_dec_zero(&real->u.real.map_count)) {
> + assert(!real->cpu_ptr &&
> + "too many unmaps or forgot RADEON_TRANSFER_TEMPORARY flag");
> +
> if (real->initial_domain & RADEON_DOMAIN_VRAM)
> real->ws->mapped_vram -= real->base.size;
> else if (real->initial_domain & RADEON_DOMAIN_GTT)
> real->ws->mapped_gtt -= real->base.size;
> real->ws->num_mapped_buffers--;
> }
>
> amdgpu_bo_cpu_unmap(real->bo);
> }
>
> @@ -1446,28 +1477,29 @@ static struct pb_buffer *amdgpu_bo_from_ptr(struct
> radeon_winsys *rws,
>
> if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
> aligned_size, 1 << 12, 0, &va, &va_handle,
> AMDGPU_VA_RANGE_HIGH))
> goto error_va_alloc;
>
> if (amdgpu_bo_va_op(buf_handle, 0, aligned_size, va, 0,
> AMDGPU_VA_OP_MAP))
> goto error_va_map;
>
> /* Initialize it. */
> + bo->is_user_ptr = true;
> pipe_reference_init(&bo->base.reference, 1);
> simple_mtx_init(&bo->lock, mtx_plain);
> bo->bo = buf_handle;
> bo->base.alignment = 0;
> bo->base.size = size;
> bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
> bo->ws = ws;
> - bo->user_ptr = pointer;
> + bo->cpu_ptr = pointer;
> bo->va = va;
> bo->u.real.va_handle = va_handle;
> bo->initial_domain = RADEON_DOMAIN_GTT;
> bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
>
> ws->allocated_gtt += aligned_size;
>
> amdgpu_add_buffer_to_global_list(bo);
>
> amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms,
> &bo->u.real.kms_handle);
> @@ -1480,21 +1512,21 @@ error_va_map:
> error_va_alloc:
> amdgpu_bo_free(buf_handle);
>
> error:
> FREE(bo);
> return NULL;
> }
>
> static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf)
> {
> - return ((struct amdgpu_winsys_bo*)buf)->user_ptr != NULL;
> + return ((struct amdgpu_winsys_bo*)buf)->is_user_ptr;
> }
>
> static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf)
> {
> struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
>
> return !bo->bo && !bo->sparse;
> }
>
> static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf)
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
> b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
> index 58e6eed733d..88f4241327d 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
> @@ -81,24 +81,25 @@ struct amdgpu_winsys_bo {
> uint32_t num_backing_pages;
>
> struct list_head backing;
>
> /* Commitment information for each page of the virtual memory
> area. */
> struct amdgpu_sparse_commitment *commitments;
> } sparse;
> } u;
>
> struct amdgpu_winsys *ws;
> - void *user_ptr; /* from buffer_from_ptr */
> + void *cpu_ptr; /* for user_ptr and permanent maps */
>
> amdgpu_bo_handle bo; /* NULL for slab entries and sparse buffers */
> bool sparse;
> + bool is_user_ptr;
> bool is_local;
> uint32_t unique_id;
> uint64_t va;
> enum radeon_bo_domain initial_domain;
>
> /* how many command streams is this bo referenced in? */
> int num_cs_references;
>
> /* how many command streams, which are being emitted in a separate
> * thread, is this bo referenced in? */
> --
> 2.19.1
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20181123/4989b532/attachment-0001.html>
More information about the mesa-dev
mailing list