<div dir="ltr"><div>Reviewed-by: Marek Olšák &lt;<a href="mailto:marek.olsak@amd.com">marek.olsak@amd.com</a>&gt;</div><div><br></div><div>Marek</div></div><br><div class="gmail_quote"><div dir="ltr">On Thu, Nov 22, 2018 at 6:32 AM Nicolai Hähnle &lt;<a href="mailto:nhaehnle@gmail.com">nhaehnle@gmail.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">From: Nicolai Hähnle &lt;<a href="mailto:nicolai.haehnle@amd.com" target="_blank">nicolai.haehnle@amd.com</a>&gt;<br>
<br>
Introduce a new driver-private transfer flag RADEON_TRANSFER_TEMPORARY<br>
that specifies whether the caller will use buffer_unmap or not. The<br>
default behavior is set to permanent maps, because that's what drivers<br>
do for Gallium buffer maps.<br>
<br>
This should eliminate the need for hacks in libdrm. Assertions are added<br>
to catch when the buffer_unmap calls don't match the (temporary)<br>
buffer_map calls.<br>
<br>
I did my best to update r600 for consistency (r300 needs no changes<br>
because it never calls buffer_unmap), even though the radeon winsys<br>
ignores the new flag.<br>
<br>
As an added bonus, this should actually improve the performance of<br>
the normal fast path, because we no longer call into libdrm at all<br>
after the first map, and there's one fewer atomic in the winsys itself<br>
(there are now no atomics left in the UNSYNCHRONIZED fast path).<br>
<br>
Cc: Leo Liu &lt;<a href="mailto:leo.liu@amd.com" target="_blank">leo.liu@amd.com</a>&gt;<br>
v2:<br>
- remove comment about visible VRAM (Marek)<br>
- don't rely on amdgpu_bo_cpu_map doing an atomic write<br>
---<br>
 src/gallium/drivers/r600/evergreen_compute.c |  4 +-<br>
 src/gallium/drivers/r600/r600_asm.c          |  4 +-<br>
 src/gallium/drivers/r600/r600_shader.c       |  4 +-<br>
 src/gallium/drivers/r600/radeon_uvd.c        |  8 +-<br>
 src/gallium/drivers/r600/radeon_vce.c        |  4 +-<br>
 src/gallium/drivers/r600/radeon_video.c      |  6 +-<br>
 src/gallium/drivers/radeon/radeon_uvd.c      | 10 +-<br>
 src/gallium/drivers/radeon/radeon_uvd_enc.c  |  6 +-<br>
 src/gallium/drivers/radeon/radeon_vce.c      |  4 +-<br>
 src/gallium/drivers/radeon/radeon_vcn_dec.c  | 18 ++--<br>
 src/gallium/drivers/radeon/radeon_vcn_enc.c  |  4 +-<br>
 src/gallium/drivers/radeon/radeon_video.c    |  6 +-<br>
 src/gallium/drivers/radeon/radeon_winsys.h   | 14 ++-<br>
 src/gallium/drivers/radeonsi/si_shader.c     |  3 +-<br>
 src/gallium/include/pipe/p_defines.h         |  8 +-<br>
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c    | 96 +++++++++++++-------<br>
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.h    |  3 +-<br>
 17 files changed, 140 insertions(+), 62 deletions(-)<br>
<br>
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c<br>
index a77f58242e3..9085be4e2f3 100644<br>
--- a/src/gallium/drivers/r600/evergreen_compute.c<br>
+++ b/src/gallium/drivers/r600/evergreen_compute.c<br>
@@ -431,21 +431,23 @@ static void *evergreen_create_compute_state(struct pipe_context *ctx,<br>
        COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");<br>
        header = cso->prog;<br>
        code = cso->prog + sizeof(struct pipe_llvm_program_header);<br>
        radeon_shader_binary_init(&shader->binary);<br>
        r600_elf_read(code, header->num_bytes, &shader->binary);<br>
        r600_create_shader(&shader->bc, &shader->binary, &use_kill);<br>
<br>
        /* Upload code + ROdata */<br>
        shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,<br>
                                                        shader->bc.ndw * 4);<br>
-       p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);<br>
+       p = r600_buffer_map_sync_with_rings(<br>
+               &rctx->b, shader->code_bo,<br>
+               PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
        //TODO: use util_memcpy_cpu_to_le32 ?<br>
        memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);<br>
        rctx->b.ws->buffer_unmap(shader->code_bo->buf);<br>
 #endif<br>
<br>
        return shader;<br>
 }<br>
<br>
 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)<br>
 {<br>
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c<br>
index 7029be24f4b..4ba77c535f9 100644<br>
--- a/src/gallium/drivers/r600/r600_asm.c<br>
+++ b/src/gallium/drivers/r600/r600_asm.c<br>
@@ -2765,21 +2765,23 @@ void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,<br>
<br>
        u_suballocator_alloc(rctx->allocator_fetch_shader, fs_size, 256,<br>
                             &shader->offset,<br>
                             (struct pipe_resource**)&shader->buffer);<br>
        if (!shader->buffer) {<br>
                r600_bytecode_clear(&bc);<br>
                FREE(shader);<br>
                return NULL;<br>
        }<br>
<br>
-       bytecode = r600_buffer_map_sync_with_rings(&rctx->b, shader->buffer, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED);<br>
+       bytecode = r600_buffer_map_sync_with_rings<br>
+               (&rctx->b, shader->buffer,<br>
+               PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED | RADEON_TRANSFER_TEMPORARY);<br>
        bytecode += shader->offset / 4;<br>
<br>
        if (R600_BIG_ENDIAN) {<br>
                for (i = 0; i < fs_size / 4; ++i) {<br>
                        bytecode[i] = util_cpu_to_le32(bc.bytecode[i]);<br>
                }<br>
        } else {<br>
                memcpy(bytecode, bc.bytecode, fs_size);<br>
        }<br>
        rctx->b.ws->buffer_unmap(shader->buffer->buf);<br>
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c<br>
index 408939d1105..fc826470d69 100644<br>
--- a/src/gallium/drivers/r600/r600_shader.c<br>
+++ b/src/gallium/drivers/r600/r600_shader.c<br>
@@ -134,21 +134,23 @@ static int store_shader(struct pipe_context *ctx,<br>
 {<br>
        struct r600_context *rctx = (struct r600_context *)ctx;<br>
        uint32_t *ptr, i;<br>
<br>
        if (shader->bo == NULL) {<br>
                shader->bo = (struct r600_resource*)<br>
                        pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);<br>
                if (shader->bo == NULL) {<br>
                        return -ENOMEM;<br>
                }<br>
-               ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);<br>
+               ptr = r600_buffer_map_sync_with_rings(<br>
+                       &rctx->b, shader->bo,<br>
+                       PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
                if (R600_BIG_ENDIAN) {<br>
                        for (i = 0; i < shader->shader.bc.ndw; ++i) {<br>
                                ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);<br>
                        }<br>
                } else {<br>
                        memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));<br>
                }<br>
                rctx->b.ws->buffer_unmap(shader->bo->buf);<br>
        }<br>
<br>
diff --git a/src/gallium/drivers/r600/radeon_uvd.c b/src/gallium/drivers/r600/radeon_uvd.c<br>
index 495a93dc55a..5568f2138e4 100644<br>
--- a/src/gallium/drivers/r600/radeon_uvd.c<br>
+++ b/src/gallium/drivers/r600/radeon_uvd.c<br>
@@ -145,21 +145,22 @@ static bool have_it(struct ruvd_decoder *dec)<br>
 /* map the next available message/feedback/itscaling buffer */<br>
 static void map_msg_fb_it_buf(struct ruvd_decoder *dec)<br>
 {<br>
        struct rvid_buffer* buf;<br>
        uint8_t *ptr;<br>
<br>
        /* grab the current message/feedback buffer */<br>
        buf = &dec->msg_fb_it_buffers[dec->cur_buffer];<br>
<br>
        /* and map it for CPU access */<br>
-       ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, PIPE_TRANSFER_WRITE);<br>
+       ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,<br>
+                                  PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
<br>
        /* calc buffer offsets */<br>
        dec->msg = (struct ruvd_msg *)ptr;<br>
        memset(dec->msg, 0, sizeof(*dec->msg));<br>
<br>
        dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET);<br>
        if (have_it(dec))<br>
                dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + dec->fb_size);<br>
 }<br>
<br>
@@ -1061,21 +1062,21 @@ static void ruvd_begin_frame(struct pipe_video_codec *decoder,<br>
<br>
        assert(decoder);<br>
<br>
        frame = ++dec->frame_number;<br>
        vl_video_buffer_set_associated_data(target, decoder, (void *)frame,<br>
                                            &ruvd_destroy_associated_data);<br>
<br>
        dec->bs_size = 0;<br>
        dec->bs_ptr = dec->ws->buffer_map(<br>
                dec->bs_buffers[dec->cur_buffer].res->buf,<br>
-               dec->cs, PIPE_TRANSFER_WRITE);<br>
+               dec->cs, PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
 }<br>
<br>
 /**<br>
  * decode a macroblock<br>
  */<br>
 static void ruvd_decode_macroblock(struct pipe_video_codec *decoder,<br>
                                   struct pipe_video_buffer *target,<br>
                                   struct pipe_picture_desc *picture,<br>
                                   const struct pipe_macroblock *macroblocks,<br>
                                   unsigned num_macroblocks)<br>
@@ -1114,21 +1115,22 @@ static void ruvd_decode_bitstream(struct pipe_video_codec *decoder,<br>
                        new_size += 2; /* save for EOI */<br>
<br>
                if (new_size > buf->res->buf->size) {<br>
                        dec->ws->buffer_unmap(buf->res->buf);<br>
                        if (!rvid_resize_buffer(dec->screen, dec->cs, buf, new_size)) {<br>
                                RVID_ERR("Can't resize bitstream buffer!");<br>
                                return;<br>
                        }<br>
<br>
                        dec->bs_ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,<br>
-                                                         PIPE_TRANSFER_WRITE);<br>
+                                                         PIPE_TRANSFER_WRITE |<br>
+                                                         RADEON_TRANSFER_TEMPORARY);<br>
                        if (!dec->bs_ptr)<br>
                                return;<br>
<br>
                        dec->bs_ptr += dec->bs_size;<br>
                }<br>
<br>
                memcpy(dec->bs_ptr, buffers[i], sizes[i]);<br>
                dec->bs_size += sizes[i];<br>
                dec->bs_ptr += sizes[i];<br>
        }<br>
diff --git a/src/gallium/drivers/r600/radeon_vce.c b/src/gallium/drivers/r600/radeon_vce.c<br>
index 60ba12a593a..e38b927b1d4 100644<br>
--- a/src/gallium/drivers/r600/radeon_vce.c<br>
+++ b/src/gallium/drivers/r600/radeon_vce.c<br>
@@ -346,21 +346,23 @@ static void rvce_end_frame(struct pipe_video_codec *encoder,<br>
        }<br>
 }<br>
<br>
 static void rvce_get_feedback(struct pipe_video_codec *encoder,<br>
                              void *feedback, unsigned *size)<br>
 {<br>
        struct rvce_encoder *enc = (struct rvce_encoder*)encoder;<br>
        struct rvid_buffer *fb = feedback;<br>
<br>
        if (size) {<br>
-               uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE);<br>
+               uint32_t *ptr = enc->ws->buffer_map(<br>
+                       fb->res->buf, enc->cs,<br>
+                       PIPE_TRANSFER_READ_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
<br>
                if (ptr[1]) {<br>
                        *size = ptr[4] - ptr[9];<br>
                } else {<br>
                        *size = 0;<br>
                }<br>
<br>
                enc->ws->buffer_unmap(fb->res->buf);<br>
        }<br>
        //dump_feedback(enc, fb);<br>
diff --git a/src/gallium/drivers/r600/radeon_video.c b/src/gallium/drivers/r600/radeon_video.c<br>
index 02fcf77d4ff..8e0af448be5 100644<br>
--- a/src/gallium/drivers/r600/radeon_video.c<br>
+++ b/src/gallium/drivers/r600/radeon_video.c<br>
@@ -90,25 +90,27 @@ bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_cmdbuf *cs,<br>
 {<br>
        struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;<br>
        struct radeon_winsys* ws = rscreen->ws;<br>
        unsigned bytes = MIN2(new_buf->res->buf->size, new_size);<br>
        struct rvid_buffer old_buf = *new_buf;<br>
        void *src = NULL, *dst = NULL;<br>
<br>
        if (!rvid_create_buffer(screen, new_buf, new_size, new_buf->usage))<br>
                goto error;<br>
<br>
-       src = ws->buffer_map(old_buf.res->buf, cs, PIPE_TRANSFER_READ);<br>
+       src = ws->buffer_map(old_buf.res->buf, cs,<br>
+                            PIPE_TRANSFER_READ | RADEON_TRANSFER_TEMPORARY);<br>
        if (!src)<br>
                goto error;<br>
<br>
-       dst = ws->buffer_map(new_buf->res->buf, cs, PIPE_TRANSFER_WRITE);<br>
+       dst = ws->buffer_map(new_buf->res->buf, cs,<br>
+                            PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
        if (!dst)<br>
                goto error;<br>
<br>
        memcpy(dst, src, bytes);<br>
        if (new_size > bytes) {<br>
                new_size -= bytes;<br>
                dst += bytes;<br>
                memset(dst, 0, new_size);<br>
        }<br>
        ws->buffer_unmap(new_buf->res->buf);<br>
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c<br>
index 62af1a311c2..ca066e89823 100644<br>
--- a/src/gallium/drivers/radeon/radeon_uvd.c<br>
+++ b/src/gallium/drivers/radeon/radeon_uvd.c<br>
@@ -141,21 +141,22 @@ static bool have_it(struct ruvd_decoder *dec)<br>
 /* map the next available message/feedback/itscaling buffer */<br>
 static void map_msg_fb_it_buf(struct ruvd_decoder *dec)<br>
 {<br>
        struct rvid_buffer* buf;<br>
        uint8_t *ptr;<br>
<br>
        /* grab the current message/feedback buffer */<br>
        buf = &dec->msg_fb_it_buffers[dec->cur_buffer];<br>
<br>
        /* and map it for CPU access */<br>
-       ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, PIPE_TRANSFER_WRITE);<br>
+       ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,<br>
+                                 PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
<br>
        /* calc buffer offsets */<br>
        dec->msg = (struct ruvd_msg *)ptr;<br>
        memset(dec->msg, 0, sizeof(*dec->msg));<br>
<br>
        dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET);<br>
        if (have_it(dec))<br>
                dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + dec->fb_size);<br>
 }<br>
<br>
@@ -1008,21 +1009,21 @@ static void ruvd_begin_frame(struct pipe_video_codec *decoder,<br>
<br>
        assert(decoder);<br>
<br>
        frame = ++dec->frame_number;<br>
        vl_video_buffer_set_associated_data(target, decoder, (void *)frame,<br>
                                            &ruvd_destroy_associated_data);<br>
<br>
        dec->bs_size = 0;<br>
        dec->bs_ptr = dec->ws->buffer_map(<br>
                dec->bs_buffers[dec->cur_buffer].res->buf,<br>
-               dec->cs, PIPE_TRANSFER_WRITE);<br>
+               dec->cs, PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
 }<br>
<br>
 /**<br>
  * decode a macroblock<br>
  */<br>
 static void ruvd_decode_macroblock(struct pipe_video_codec *decoder,<br>
                                   struct pipe_video_buffer *target,<br>
                                   struct pipe_picture_desc *picture,<br>
                                   const struct pipe_macroblock *macroblocks,<br>
                                   unsigned num_macroblocks)<br>
@@ -1053,22 +1054,23 @@ static void ruvd_decode_bitstream(struct pipe_video_codec *decoder,<br>
                struct rvid_buffer *buf = &dec->bs_buffers[dec->cur_buffer];<br>
                unsigned new_size = dec->bs_size + sizes[i];<br>
<br>
                if (new_size > buf->res->buf->size) {<br>
                        dec->ws->buffer_unmap(buf->res->buf);<br>
                        if (!si_vid_resize_buffer(dec->screen, dec->cs, buf, new_size)) {<br>
                                RVID_ERR("Can't resize bitstream buffer!");<br>
                                return;<br>
                        }<br>
<br>
-                       dec->bs_ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,<br>
-                                                         PIPE_TRANSFER_WRITE);<br>
+                       dec->bs_ptr = dec->ws->buffer_map(<br>
+                               buf->res->buf, dec->cs,<br>
+                               PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
                        if (!dec->bs_ptr)<br>
                                return;<br>
<br>
                        dec->bs_ptr += dec->bs_size;<br>
                }<br>
<br>
                memcpy(dec->bs_ptr, buffers[i], sizes[i]);<br>
                dec->bs_size += sizes[i];<br>
                dec->bs_ptr += sizes[i];<br>
        }<br>
diff --git a/src/gallium/drivers/radeon/radeon_uvd_enc.c b/src/gallium/drivers/radeon/radeon_uvd_enc.c<br>
index 4384e5e1646..3164dbb2c20 100644<br>
--- a/src/gallium/drivers/radeon/radeon_uvd_enc.c<br>
+++ b/src/gallium/drivers/radeon/radeon_uvd_enc.c<br>
@@ -256,23 +256,23 @@ radeon_uvd_enc_destroy(struct pipe_video_codec *encoder)<br>
<br>
 static void<br>
 radeon_uvd_enc_get_feedback(struct pipe_video_codec *encoder,<br>
                             void *feedback, unsigned *size)<br>
 {<br>
    struct radeon_uvd_encoder *enc = (struct radeon_uvd_encoder *) encoder;<br>
    struct rvid_buffer *fb = feedback;<br>
<br>
    if (NULL != size) {<br>
       radeon_uvd_enc_feedback_t *fb_data =<br>
-         (radeon_uvd_enc_feedback_t *) enc->ws->buffer_map(fb->res->buf,<br>
-                                                           enc->cs,<br>
-                                                           PIPE_TRANSFER_READ_WRITE);<br>
+         (radeon_uvd_enc_feedback_t *) enc->ws->buffer_map(<br>
+               fb->res->buf, enc->cs,<br>
+               PIPE_TRANSFER_READ_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
<br>
       if (!fb_data->status)<br>
          *size = fb_data->bitstream_size;<br>
       else<br>
          *size = 0;<br>
       enc->ws->buffer_unmap(fb->res->buf);<br>
    }<br>
<br>
    si_vid_destroy_buffer(fb);<br>
    FREE(fb);<br>
diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c<br>
index 310d1654b05..94df06e88c6 100644<br>
--- a/src/gallium/drivers/radeon/radeon_vce.c<br>
+++ b/src/gallium/drivers/radeon/radeon_vce.c<br>
@@ -345,21 +345,23 @@ static void rvce_end_frame(struct pipe_video_codec *encoder,<br>
        }<br>
 }<br>
<br>
 static void rvce_get_feedback(struct pipe_video_codec *encoder,<br>
                              void *feedback, unsigned *size)<br>
 {<br>
        struct rvce_encoder *enc = (struct rvce_encoder*)encoder;<br>
        struct rvid_buffer *fb = feedback;<br>
<br>
        if (size) {<br>
-               uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE);<br>
+               uint32_t *ptr = enc->ws->buffer_map(<br>
+                       fb->res->buf, enc->cs,<br>
+                       PIPE_TRANSFER_READ_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
<br>
                if (ptr[1]) {<br>
                        *size = ptr[4] - ptr[9];<br>
                } else {<br>
                        *size = 0;<br>
                }<br>
<br>
                enc->ws->buffer_unmap(fb->res->buf);<br>
        }<br>
        //dump_feedback(enc, fb);<br>
diff --git a/src/gallium/drivers/radeon/radeon_vcn_dec.c b/src/gallium/drivers/radeon/radeon_vcn_dec.c<br>
index 1ee85ae3d3f..e402af21a64 100644<br>
--- a/src/gallium/drivers/radeon/radeon_vcn_dec.c<br>
+++ b/src/gallium/drivers/radeon/radeon_vcn_dec.c<br>
@@ -934,21 +934,23 @@ static struct pb_buffer *rvcn_dec_message_decode(struct radeon_decoder *dec,<br>
                        ctx_size += 8 * 2 * 4096;<br>
<br>
                        if (dec->base.profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2)<br>
                                ctx_size += 8 * 2 * 4096;<br>
<br>
                        if (!si_vid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT))<br>
                                RVID_ERR("Can't allocated context buffer.\n");<br>
                        si_vid_clear_buffer(dec->base.context, &dec->ctx);<br>
<br>
                        /* ctx needs probs table */<br>
-                       ptr = dec->ws->buffer_map(dec->ctx.res->buf, dec->cs, PIPE_TRANSFER_WRITE);<br>
+                       ptr = dec->ws->buffer_map(<br>
+                               dec->ctx.res->buf, dec->cs,<br>
+                               PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
                        fill_probs_table(ptr);<br>
                        dec->ws->buffer_unmap(dec->ctx.res->buf);<br>
                }<br>
                break;<br>
        }<br>
        default:<br>
                assert(0);<br>
                return NULL;<br>
        }<br>
<br>
@@ -1027,21 +1029,22 @@ static bool have_probs(struct radeon_decoder *dec)<br>
 /* map the next available message/feedback/itscaling buffer */<br>
 static void map_msg_fb_it_probs_buf(struct radeon_decoder *dec)<br>
 {<br>
        struct rvid_buffer* buf;<br>
        uint8_t *ptr;<br>
<br>
        /* grab the current message/feedback buffer */<br>
        buf = &dec->msg_fb_it_probs_buffers[dec->cur_buffer];<br>
<br>
        /* and map it for CPU access */<br>
-       ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, PIPE_TRANSFER_WRITE);<br>
+       ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,<br>
+                                 PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
<br>
        /* calc buffer offsets */<br>
        dec->msg = ptr;<br>
<br>
        dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET);<br>
        if (have_it(dec))<br>
                dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + FB_BUFFER_SIZE);<br>
        else if (have_probs(dec))<br>
                dec->probs = (uint8_t *)(ptr + FB_BUFFER_OFFSET + FB_BUFFER_SIZE);<br>
 }<br>
@@ -1305,21 +1308,21 @@ static void radeon_dec_begin_frame(struct pipe_video_codec *decoder,<br>
        assert(decoder);<br>
<br>
        frame = ++dec->frame_number;<br>
        if (dec->stream_type != RDECODE_CODEC_VP9)<br>
                vl_video_buffer_set_associated_data(target, decoder, (void *)frame,<br>
                                            &radeon_dec_destroy_associated_data);<br>
<br>
        dec->bs_size = 0;<br>
        dec->bs_ptr = dec->ws->buffer_map(<br>
                dec->bs_buffers[dec->cur_buffer].res->buf,<br>
-               dec->cs, PIPE_TRANSFER_WRITE);<br>
+               dec->cs, PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
 }<br>
<br>
 /**<br>
  * decode a macroblock<br>
  */<br>
 static void radeon_dec_decode_macroblock(struct pipe_video_codec *decoder,<br>
                                   struct pipe_video_buffer *target,<br>
                                   struct pipe_picture_desc *picture,<br>
                                   const struct pipe_macroblock *macroblocks,<br>
                                   unsigned num_macroblocks)<br>
@@ -1350,22 +1353,23 @@ static void radeon_dec_decode_bitstream(struct pipe_video_codec *decoder,<br>
                struct rvid_buffer *buf = &dec->bs_buffers[dec->cur_buffer];<br>
                unsigned new_size = dec->bs_size + sizes[i];<br>
<br>
                if (new_size > buf->res->buf->size) {<br>
                        dec->ws->buffer_unmap(buf->res->buf);<br>
                        if (!si_vid_resize_buffer(dec->screen, dec->cs, buf, new_size)) {<br>
                                RVID_ERR("Can't resize bitstream buffer!");<br>
                                return;<br>
                        }<br>
<br>
-                       dec->bs_ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,<br>
-                                                         PIPE_TRANSFER_WRITE);<br>
+                       dec->bs_ptr = dec->ws->buffer_map(<br>
+                               buf->res->buf, dec->cs,<br>
+                               PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
                        if (!dec->bs_ptr)<br>
                                return;<br>
<br>
                        dec->bs_ptr += dec->bs_size;<br>
                }<br>
<br>
                memcpy(dec->bs_ptr, buffers[i], sizes[i]);<br>
                dec->bs_size += sizes[i];<br>
                dec->bs_ptr += sizes[i];<br>
        }<br>
@@ -1536,21 +1540,23 @@ struct pipe_video_codec *radeon_create_decoder(struct pipe_context *context,<br>
                }<br>
<br>
                si_vid_clear_buffer(context, &dec->msg_fb_it_probs_buffers[i]);<br>
                si_vid_clear_buffer(context, &dec->bs_buffers[i]);<br>
<br>
                if (have_probs(dec)) {<br>
                        struct rvid_buffer* buf;<br>
                        void *ptr;<br>
<br>
                        buf = &dec->msg_fb_it_probs_buffers[i];<br>
-                       ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, PIPE_TRANSFER_WRITE);<br>
+                       ptr = dec->ws->buffer_map(<br>
+                               buf->res->buf, dec->cs,<br>
+                               PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
                        ptr += FB_BUFFER_OFFSET + FB_BUFFER_SIZE;<br>
                        fill_probs_table(ptr);<br>
                        dec->ws->buffer_unmap(buf->res->buf);<br>
                }<br>
        }<br>
<br>
        dpb_size = calc_dpb_size(dec);<br>
        if (dpb_size) {<br>
                if (!si_vid_create_buffer(dec->screen, &dec->dpb, dpb_size, PIPE_USAGE_DEFAULT)) {<br>
                        RVID_ERR("Can't allocated dpb.\n");<br>
diff --git a/src/gallium/drivers/radeon/radeon_vcn_enc.c b/src/gallium/drivers/radeon/radeon_vcn_enc.c<br>
index e8676f6c721..7d64a28a405 100644<br>
--- a/src/gallium/drivers/radeon/radeon_vcn_enc.c<br>
+++ b/src/gallium/drivers/radeon/radeon_vcn_enc.c<br>
@@ -237,21 +237,23 @@ static void radeon_enc_destroy(struct pipe_video_codec *encoder)<br>
        FREE(enc);<br>
 }<br>
<br>
 static void radeon_enc_get_feedback(struct pipe_video_codec *encoder,<br>
                                                          void *feedback, unsigned *size)<br>
 {<br>
        struct radeon_encoder *enc = (struct radeon_encoder*)encoder;<br>
        struct rvid_buffer *fb = feedback;<br>
<br>
        if (size) {<br>
-               uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE);<br>
+               uint32_t *ptr = enc->ws->buffer_map(<br>
+                       fb->res->buf, enc->cs,<br>
+                       PIPE_TRANSFER_READ_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
                if (ptr[1])<br>
                        *size = ptr[6];<br>
                else<br>
                        *size = 0;<br>
                enc->ws->buffer_unmap(fb->res->buf);<br>
        }<br>
<br>
        si_vid_destroy_buffer(fb);<br>
        FREE(fb);<br>
 }<br>
diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c<br>
index a39ce4cc73e..bb1173e8005 100644<br>
--- a/src/gallium/drivers/radeon/radeon_video.c<br>
+++ b/src/gallium/drivers/radeon/radeon_video.c<br>
@@ -81,25 +81,27 @@ bool si_vid_resize_buffer(struct pipe_screen *screen, struct radeon_cmdbuf *cs,<br>
 {<br>
        struct si_screen *sscreen = (struct si_screen *)screen;<br>
        struct radeon_winsys* ws = sscreen->ws;<br>
        unsigned bytes = MIN2(new_buf->res->buf->size, new_size);<br>
        struct rvid_buffer old_buf = *new_buf;<br>
        void *src = NULL, *dst = NULL;<br>
<br>
        if (!si_vid_create_buffer(screen, new_buf, new_size, new_buf->usage))<br>
                goto error;<br>
<br>
-       src = ws->buffer_map(old_buf.res->buf, cs, PIPE_TRANSFER_READ);<br>
+       src = ws->buffer_map(old_buf.res->buf, cs,<br>
+                            PIPE_TRANSFER_READ | RADEON_TRANSFER_TEMPORARY);<br>
        if (!src)<br>
                goto error;<br>
<br>
-       dst = ws->buffer_map(new_buf->res->buf, cs, PIPE_TRANSFER_WRITE);<br>
+       dst = ws->buffer_map(new_buf->res->buf, cs,<br>
+                            PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);<br>
        if (!dst)<br>
                goto error;<br>
<br>
        memcpy(dst, src, bytes);<br>
        if (new_size > bytes) {<br>
                new_size -= bytes;<br>
                dst += bytes;<br>
                memset(dst, 0, new_size);<br>
        }<br>
        ws->buffer_unmap(new_buf->res->buf);<br>
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h<br>
index 49f8bb279e5..a56ff75ad24 100644<br>
--- a/src/gallium/drivers/radeon/radeon_winsys.h<br>
+++ b/src/gallium/drivers/radeon/radeon_winsys.h<br>
@@ -69,20 +69,29 @@ enum radeon_bo_usage { /* bitfield */<br>
     RADEON_USAGE_READ = 2,<br>
     RADEON_USAGE_WRITE = 4,<br>
     RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE,<br>
<br>
     /* The winsys ensures that the CS submission will be scheduled after<br>
      * previously flushed CSs referencing this BO in a conflicting way.<br>
      */<br>
     RADEON_USAGE_SYNCHRONIZED = 8<br>
 };<br>
<br>
+enum radeon_transfer_flags {<br>
+   /* Indicates that the caller will unmap the buffer.<br>
+    *<br>
+    * Not unmapping buffers is an important performance optimization for<br>
+    * OpenGL (avoids kernel overhead for frequently mapped buffers).<br>
+    */<br>
+   RADEON_TRANSFER_TEMPORARY = (PIPE_TRANSFER_DRV_PRV << 0),<br>
+};<br>
+<br>
 #define RADEON_SPARSE_PAGE_SIZE (64 * 1024)<br>
<br>
 enum ring_type {<br>
     RING_GFX = 0,<br>
     RING_COMPUTE,<br>
     RING_DMA,<br>
     RING_UVD,<br>
     RING_VCE,<br>
     RING_UVD_ENC,<br>
     RING_VCN_DEC,<br>
@@ -287,23 +296,26 @@ struct radeon_winsys {<br>
     struct pb_buffer *(*buffer_create)(struct radeon_winsys *ws,<br>
                                        uint64_t size,<br>
                                        unsigned alignment,<br>
                                        enum radeon_bo_domain domain,<br>
                                        enum radeon_bo_flag flags);<br>
<br>
     /**<br>
      * Map the entire data store of a buffer object into the client's address<br>
      * space.<br>
      *<br>
+     * Callers are expected to unmap buffers again if and only if the<br>
+     * RADEON_TRANSFER_TEMPORARY flag is set in \p usage.<br>
+     *<br>
      * \param buf       A winsys buffer object to map.<br>
      * \param cs        A command stream to flush if the buffer is referenced by it.<br>
-     * \param usage     A bitmask of the PIPE_TRANSFER_* flags.<br>
+     * \param usage     A bitmask of the PIPE_TRANSFER_* and RADEON_TRANSFER_* flags.<br>
      * \return          The pointer at the beginning of the buffer.<br>
      */<br>
     void *(*buffer_map)(struct pb_buffer *buf,<br>
                         struct radeon_cmdbuf *cs,<br>
                         enum pipe_transfer_usage usage);<br>
<br>
     /**<br>
      * Unmap a buffer object from the client's address space.<br>
      *<br>
      * \param buf       A winsys buffer object to unmap.<br>
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c<br>
index 19522cc97b1..d455fb5db6a 100644<br>
--- a/src/gallium/drivers/radeonsi/si_shader.c<br>
+++ b/src/gallium/drivers/radeonsi/si_shader.c<br>
@@ -5286,21 +5286,22 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)<br>
                                                0 : SI_RESOURCE_FLAG_READ_ONLY,<br>
                                               PIPE_USAGE_IMMUTABLE,<br>
                                               align(bo_size, SI_CPDMA_ALIGNMENT),<br>
                                               256);<br>
        if (!shader->bo)<br>
                return -ENOMEM;<br>
<br>
        /* Upload. */<br>
        ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,<br>
                                        PIPE_TRANSFER_READ_WRITE |<br>
-                                       PIPE_TRANSFER_UNSYNCHRONIZED);<br>
+                                       PIPE_TRANSFER_UNSYNCHRONIZED |<br>
+                                       RADEON_TRANSFER_TEMPORARY);<br>
<br>
        /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are<br>
         * endian-independent. */<br>
        if (prolog) {<br>
                memcpy(ptr, prolog->code, prolog->code_size);<br>
                ptr += prolog->code_size;<br>
        }<br>
        if (previous_stage) {<br>
                memcpy(ptr, previous_stage->code, previous_stage->code_size);<br>
                ptr += previous_stage->code_size;<br>
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h<br>
index 693f041b1da..e99895d30d8 100644<br>
--- a/src/gallium/include/pipe/p_defines.h<br>
+++ b/src/gallium/include/pipe/p_defines.h<br>
@@ -334,21 +334,27 @@ enum pipe_transfer_usage<br>
     */<br>
    PIPE_TRANSFER_PERSISTENT = (1 << 13),<br>
<br>
    /**<br>
     * If PERSISTENT is set, this ensures any writes done by the device are<br>
     * immediately visible to the CPU and vice versa.<br>
     *<br>
     * PIPE_RESOURCE_FLAG_MAP_COHERENT must be set when creating<br>
     * the resource.<br>
     */<br>
-   PIPE_TRANSFER_COHERENT = (1 << 14)<br>
+   PIPE_TRANSFER_COHERENT = (1 << 14),<br>
+<br>
+   /**<br>
+    * This and higher bits are reserved for private use by drivers. Drivers<br>
+    * should use this as (PIPE_TRANSFER_DRV_PRV << i).<br>
+    */<br>
+   PIPE_TRANSFER_DRV_PRV = (1 << 24)<br>
 };<br>
<br>
 /**<br>
  * Flags for the flush function.<br>
  */<br>
 enum pipe_flush_flags<br>
 {<br>
    PIPE_FLUSH_END_OF_FRAME = (1 << 0),<br>
    PIPE_FLUSH_DEFERRED = (1 << 1),<br>
    PIPE_FLUSH_FENCE_FD = (1 << 2),<br>
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c<br>
index 9f0d4c12482..99cd8cb31a5 100644<br>
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c<br>
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c<br>
@@ -49,20 +49,21 @@<br>
 struct amdgpu_sparse_backing_chunk {<br>
    uint32_t begin, end;<br>
 };<br>
<br>
 static struct pb_buffer *<br>
 amdgpu_bo_create(struct radeon_winsys *rws,<br>
                  uint64_t size,<br>
                  unsigned alignment,<br>
                  enum radeon_bo_domain domain,<br>
                  enum radeon_bo_flag flags);<br>
+static void amdgpu_bo_unmap(struct pb_buffer *buf);<br>
<br>
 static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,<br>
                            enum radeon_bo_usage usage)<br>
 {<br>
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);<br>
    struct amdgpu_winsys *ws = bo->ws;<br>
    int64_t abs_timeout;<br>
<br>
    if (timeout == 0) {<br>
       if (p_atomic_read(&bo->num_active_ioctls))<br>
@@ -166,20 +167,26 @@ static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)<br>
    bo->max_fences = 0;<br>
 }<br>
<br>
 void amdgpu_bo_destroy(struct pb_buffer *_buf)<br>
 {<br>
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);<br>
    struct amdgpu_winsys *ws = bo->ws;<br>
<br>
    assert(bo->bo && "must not be called for slab entries");<br>
<br>
+   if (!bo->is_user_ptr && bo->cpu_ptr) {<br>
+      bo->cpu_ptr = NULL;<br>
+      amdgpu_bo_unmap(&bo->base);<br>
+   }<br>
+   assert(bo->is_user_ptr || bo->u.real.map_count == 0);<br>
+<br>
    if (ws->debug_all_bos) {<br>
       simple_mtx_lock(&ws->global_bo_list_lock);<br>
       LIST_DEL(&bo->u.real.global_list_item);<br>
       ws->num_buffers--;<br>
       simple_mtx_unlock(&ws->global_bo_list_lock);<br>
    }<br>
<br>
    simple_mtx_lock(&ws->bo_export_table_lock);<br>
    util_hash_table_remove(ws->bo_export_table, bo->bo);<br>
    simple_mtx_unlock(&ws->bo_export_table_lock);<br>
@@ -188,54 +195,66 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf)<br>
    amdgpu_va_range_free(bo->u.real.va_handle);<br>
    amdgpu_bo_free(bo->bo);<br>
<br>
    amdgpu_bo_remove_fences(bo);<br>
<br>
    if (bo->initial_domain & RADEON_DOMAIN_VRAM)<br>
       ws->allocated_vram -= align64(bo->base.size, ws->info.gart_page_size);<br>
    else if (bo->initial_domain & RADEON_DOMAIN_GTT)<br>
       ws->allocated_gtt -= align64(bo->base.size, ws->info.gart_page_size);<br>
<br>
-   if (bo->u.real.map_count >= 1) {<br>
-      if (bo->initial_domain & RADEON_DOMAIN_VRAM)<br>
-         ws->mapped_vram -= bo->base.size;<br>
-      else if (bo->initial_domain & RADEON_DOMAIN_GTT)<br>
-         ws->mapped_gtt -= bo->base.size;<br>
-      ws->num_mapped_buffers--;<br>
-   }<br>
-<br>
    simple_mtx_destroy(&bo->lock);<br>
    FREE(bo);<br>
 }<br>
<br>
 static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf)<br>
 {<br>
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);<br>
<br>
    assert(bo->bo); /* slab buffers have a separate vtbl */<br>
<br>
    if (bo->u.real.use_reusable_pool)<br>
       pb_cache_add_buffer(&bo->u.real.cache_entry);<br>
    else<br>
       amdgpu_bo_destroy(_buf);<br>
 }<br>
<br>
+static bool amdgpu_bo_do_map(struct amdgpu_winsys_bo *bo, void **cpu)<br>
+{<br>
+   assert(!bo->sparse && bo->bo && !bo->is_user_ptr);<br>
+   int r = amdgpu_bo_cpu_map(bo->bo, cpu);<br>
+   if (r) {<br>
+      /* Clear the cache and try again. */<br>
+      pb_cache_release_all_buffers(&bo->ws->bo_cache);<br>
+      r = amdgpu_bo_cpu_map(bo->bo, cpu);<br>
+      if (r)<br>
+         return false;<br>
+   }<br>
+<br>
+   if (p_atomic_inc_return(&bo->u.real.map_count) == 1) {<br>
+      if (bo->initial_domain & RADEON_DOMAIN_VRAM)<br>
+         bo->ws->mapped_vram += bo->base.size;<br>
+      else if (bo->initial_domain & RADEON_DOMAIN_GTT)<br>
+         bo->ws->mapped_gtt += bo->base.size;<br>
+      bo->ws->num_mapped_buffers++;<br>
+   }<br>
+<br>
+   return true;<br>
+}<br>
+<br>
 static void *amdgpu_bo_map(struct pb_buffer *buf,<br>
                            struct radeon_cmdbuf *rcs,<br>
                            enum pipe_transfer_usage usage)<br>
 {<br>
    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;<br>
    struct amdgpu_winsys_bo *real;<br>
    struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;<br>
-   int r;<br>
-   void *cpu = NULL;<br>
-   uint64_t offset = 0;<br>
<br>
    assert(!bo->sparse);<br>
<br>
    /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */<br>
    if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {<br>
       /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */<br>
       if (usage & PIPE_TRANSFER_DONTBLOCK) {<br>
          if (!(usage & PIPE_TRANSFER_WRITE)) {<br>
             /* Mapping for read.<br>
              *<br>
@@ -306,63 +325,75 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,<br>
             }<br>
<br>
             amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,<br>
                            RADEON_USAGE_READWRITE);<br>
          }<br>
<br>
          bo->ws->buffer_wait_time += os_time_get_nano() - time;<br>
       }<br>
    }<br>
<br>
-   /* If the buffer is created from user memory, return the user pointer. */<br>
-   if (bo->user_ptr)<br>
-      return bo->user_ptr;<br>
+   /* Buffer synchronization has been checked, now actually map the buffer. */<br>
+   void *cpu = NULL;<br>
+   uint64_t offset = 0;<br>
<br>
    if (bo->bo) {<br>
       real = bo;<br>
    } else {<br>
       real = bo->u.slab.real;<br>
       offset = bo->va - real->va;<br>
    }<br>
<br>
-   r = amdgpu_bo_cpu_map(real->bo, &cpu);<br>
-   if (r) {<br>
-      /* Clear the cache and try again. */<br>
-      pb_cache_release_all_buffers(&real->ws->bo_cache);<br>
-      r = amdgpu_bo_cpu_map(real->bo, &cpu);<br>
-      if (r)<br>
-         return NULL;<br>
+   if (usage & RADEON_TRANSFER_TEMPORARY) {<br>
+      if (real->is_user_ptr) {<br>
+         cpu = real->cpu_ptr;<br>
+      } else {<br>
+         if (!amdgpu_bo_do_map(real, &cpu))<br>
+            return NULL;<br>
+      }<br>
+   } else {<br>
+      cpu = p_atomic_read(&real->cpu_ptr);<br>
+      if (!cpu) {<br>
+         simple_mtx_lock(&real->lock);<br>
+         /* Must re-check due to the possibility of a race. Re-check need not<br>
+          * be atomic thanks to the lock. */<br>
+         cpu = real->cpu_ptr;<br>
+         if (!cpu) {<br>
+            if (!amdgpu_bo_do_map(real, &cpu)) {<br>
+               simple_mtx_unlock(&real->lock);<br>
+               return NULL;<br>
+            }<br>
+            p_atomic_set(&real->cpu_ptr, cpu);<br>
+         }<br>
+         simple_mtx_unlock(&real->lock);<br>
+      }<br>
    }<br>
<br>
-   if (p_atomic_inc_return(&real->u.real.map_count) == 1) {<br>
-      if (real->initial_domain & RADEON_DOMAIN_VRAM)<br>
-         real->ws->mapped_vram += real->base.size;<br>
-      else if (real->initial_domain & RADEON_DOMAIN_GTT)<br>
-         real->ws->mapped_gtt += real->base.size;<br>
-      real->ws->num_mapped_buffers++;<br>
-   }<br>
    return (uint8_t*)cpu + offset;<br>
 }<br>
<br>
 static void amdgpu_bo_unmap(struct pb_buffer *buf)<br>
 {<br>
    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;<br>
    struct amdgpu_winsys_bo *real;<br>
<br>
    assert(!bo->sparse);<br>
<br>
-   if (bo->user_ptr)<br>
+   if (bo->is_user_ptr)<br>
       return;<br>
<br>
    real = bo->bo ? bo : bo->u.slab.real;<br>
-<br>
+   assert(real->u.real.map_count != 0 && "too many unmaps");<br>
    if (p_atomic_dec_zero(&real->u.real.map_count)) {<br>
+      assert(!real->cpu_ptr &&<br>
+             "too many unmaps or forgot RADEON_TRANSFER_TEMPORARY flag");<br>
+<br>
       if (real->initial_domain & RADEON_DOMAIN_VRAM)<br>
          real->ws->mapped_vram -= real->base.size;<br>
       else if (real->initial_domain & RADEON_DOMAIN_GTT)<br>
          real->ws->mapped_gtt -= real->base.size;<br>
       real->ws->num_mapped_buffers--;<br>
    }<br>
<br>
    amdgpu_bo_cpu_unmap(real->bo);<br>
 }<br>
<br>
@@ -1446,28 +1477,29 @@ static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,<br>
<br>
     if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,<br>
                               aligned_size, 1 << 12, 0, &va, &va_handle,<br>
                              AMDGPU_VA_RANGE_HIGH))<br>
         goto error_va_alloc;<br>
<br>
     if (amdgpu_bo_va_op(buf_handle, 0, aligned_size, va, 0, AMDGPU_VA_OP_MAP))<br>
         goto error_va_map;<br>
<br>
     /* Initialize it. */<br>
+    bo->is_user_ptr = true;<br>
     pipe_reference_init(&bo->base.reference, 1);<br>
     simple_mtx_init(&bo->lock, mtx_plain);<br>
     bo->bo = buf_handle;<br>
     bo->base.alignment = 0;<br>
     bo->base.size = size;<br>
     bo->base.vtbl = &amdgpu_winsys_bo_vtbl;<br>
     bo->ws = ws;<br>
-    bo->user_ptr = pointer;<br>
+    bo->cpu_ptr = pointer;<br>
     bo->va = va;<br>
     bo->u.real.va_handle = va_handle;<br>
     bo->initial_domain = RADEON_DOMAIN_GTT;<br>
     bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);<br>
<br>
     ws->allocated_gtt += aligned_size;<br>
<br>
     amdgpu_add_buffer_to_global_list(bo);<br>
<br>
     amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);<br>
@@ -1480,21 +1512,21 @@ error_va_map:<br>
 error_va_alloc:<br>
     amdgpu_bo_free(buf_handle);<br>
<br>
 error:<br>
     FREE(bo);<br>
     return NULL;<br>
 }<br>
<br>
 static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf)<br>
 {<br>
-   return ((struct amdgpu_winsys_bo*)buf)->user_ptr != NULL;<br>
+   return ((struct amdgpu_winsys_bo*)buf)->is_user_ptr;<br>
 }<br>
<br>
 static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf)<br>
 {<br>
    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;<br>
<br>
    return !bo->bo && !bo->sparse;<br>
 }<br>
<br>
 static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf)<br>
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h<br>
index 58e6eed733d..88f4241327d 100644<br>
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h<br>
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h<br>
@@ -81,24 +81,25 @@ struct amdgpu_winsys_bo {<br>
          uint32_t num_backing_pages;<br>
<br>
          struct list_head backing;<br>
<br>
          /* Commitment information for each page of the virtual memory area. */<br>
          struct amdgpu_sparse_commitment *commitments;<br>
       } sparse;<br>
    } u;<br>
<br>
    struct amdgpu_winsys *ws;<br>
-   void *user_ptr; /* from buffer_from_ptr */<br>
+   void *cpu_ptr; /* for user_ptr and permanent maps */<br>
<br>
    amdgpu_bo_handle bo; /* NULL for slab entries and sparse buffers */<br>
    bool sparse;<br>
+   bool is_user_ptr;<br>
    bool is_local;<br>
    uint32_t unique_id;<br>
    uint64_t va;<br>
    enum radeon_bo_domain initial_domain;<br>
<br>
    /* how many command streams is this bo referenced in? */<br>
    int num_cs_references;<br>
<br>
    /* how many command streams, which are being emitted in a separate<br>
     * thread, is this bo referenced in? */<br>
-- <br>
2.19.1<br>
<br>
_______________________________________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org" target="_blank">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
</blockquote></div>