<div dir="ltr"><div dir="ltr"><div>I'll remove the env var.</div><div><br></div><div>Additionally, I'm amending this:</div><div><br></div><div>diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c<br>index 3f8db7cf4f0..4936eb5a5b1 100644<br>--- a/src/gallium/drivers/radeonsi/si_buffer.c<br>+++ b/src/gallium/drivers/radeonsi/si_buffer.c<br>@@ -461,10 +461,20 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx,<br>                    si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||<br>                    !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) {<br>                        /* Do a wait-free write-only transfer using a temporary buffer. */<br>-                       unsigned offset;<br>+                       struct u_upload_mgr *uploader;<br>                        struct si_resource *staging = NULL;<br>+                       unsigned offset;<br>+<br>+                       /* If we are not called from the driver thread, we have<br>+                        * to use the uploader from u_threaded_context, which is<br>+                        * local to the calling thread.<br>+                        */<br>+                       if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)<br>+                               uploader = sctx->tc->base.stream_uploader;<br>+                       else<br>+                               uploader = sctx->b.stream_uploader;<br> <br>-                       u_upload_alloc(ctx->stream_uploader, 0,<br>+                       u_upload_alloc(uploader, 0,<br>                                        box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT),<br>                                       sctx->screen->info.tcc_cache_line_size,<br>                                       &offset, (struct pipe_resource**)&staging,<br></div><div><br></div><div>Marek<br></div><div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Mon, Feb 11, 2019 at 4:38 AM 
Nicolai Hähnle &lt;<a href="mailto:nhaehnle@gmail.com">nhaehnle@gmail.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">On 07.02.19 02:22, Marek Olšák wrote:<br>
> +     bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->dma_cs && debug_get_bool_option("SDMA", true);<br>
<br>
Could you please namespace the environment variable, e.g. RADEONSI_SDMA?<br>
<br>
Apart from that, series is<br>
<br>
Reviewed-by: Nicolai Hähnle &lt;<a href="mailto:nicolai.haehnle@amd.com" target="_blank">nicolai.haehnle@amd.com</a>&gt;<br>
<br>
<br>
> +     sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024,<br>
> +                                              0, PIPE_USAGE_DEFAULT,<br>
> +                                              SI_RESOURCE_FLAG_32BIT |<br>
> +                                              (use_sdma_upload ?<br>
> +                                                       SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA :<br>
> +                                                       (sscreen->cpdma_prefetch_writes_memory ?<br>
> +                                                                0 : SI_RESOURCE_FLAG_READ_ONLY)));<br>
> +     if (!sctx->b.const_uploader)<br>
> +             goto fail;<br>
> +<br>
> +     if (use_sdma_upload)<br>
> +             u_upload_enable_flush_explicit(sctx->b.const_uploader);<br>
> +<br>
>       si_init_buffer_functions(sctx);<br>
>       si_init_clear_functions(sctx);<br>
>       si_init_blit_functions(sctx);<br>
>       si_init_compute_functions(sctx);<br>
>       si_init_compute_blit_functions(sctx);<br>
>       si_init_debug_functions(sctx);<br>
>       si_init_msaa_functions(sctx);<br>
>       si_init_streamout_functions(sctx);<br>
>   <br>
>       if (sscreen->info.has_hw_decode) {<br>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h<br>
> index b01d5744752..b208bdeb848 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_pipe.h<br>
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h<br>
> @@ -103,20 +103,22 @@<br>
>   #define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024<br>
>   <br>
>   #define SI_RESOURCE_FLAG_TRANSFER   (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)<br>
>   #define SI_RESOURCE_FLAG_FLUSHED_DEPTH      (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)<br>
>   #define SI_RESOURCE_FLAG_FORCE_MSAA_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2)<br>
>   #define SI_RESOURCE_FLAG_DISABLE_DCC        (PIPE_RESOURCE_FLAG_DRV_PRIV << 3)<br>
>   #define SI_RESOURCE_FLAG_UNMAPPABLE (PIPE_RESOURCE_FLAG_DRV_PRIV << 4)<br>
>   #define SI_RESOURCE_FLAG_READ_ONLY  (PIPE_RESOURCE_FLAG_DRV_PRIV << 5)<br>
>   #define SI_RESOURCE_FLAG_32BIT              (PIPE_RESOURCE_FLAG_DRV_PRIV << 6)<br>
>   #define SI_RESOURCE_FLAG_CLEAR              (PIPE_RESOURCE_FLAG_DRV_PRIV << 7)<br>
> +/* For const_uploader, upload data via GTT and copy to VRAM on context flush via SDMA. */<br>
> +#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA  (PIPE_RESOURCE_FLAG_DRV_PRIV << 8)<br>
>   <br>
>   enum si_clear_code<br>
>   {<br>
>       DCC_CLEAR_COLOR_0000   = 0x00000000,<br>
>       DCC_CLEAR_COLOR_0001   = 0x40404040,<br>
>       DCC_CLEAR_COLOR_1110   = 0x80808080,<br>
>       DCC_CLEAR_COLOR_1111   = 0xC0C0C0C0,<br>
>       DCC_CLEAR_COLOR_REG    = 0x20202020,<br>
>       DCC_UNCOMPRESSED       = 0xFFFFFFFF,<br>
>   };<br>
> @@ -769,20 +771,28 @@ struct si_saved_cs {<br>
>       struct si_context       *ctx;<br>
>       struct radeon_saved_cs  gfx;<br>
>       struct si_resource      *trace_buf;<br>
>       unsigned                trace_id;<br>
>   <br>
>       unsigned                gfx_last_dw;<br>
>       bool                    flushed;<br>
>       int64_t                 time_flush;<br>
>   };<br>
>   <br>
> +struct si_sdma_upload {<br>
> +     struct si_resource      *dst;<br>
> +     struct si_resource      *src;<br>
> +     unsigned                src_offset;<br>
> +     unsigned                dst_offset;<br>
> +     unsigned                size;<br>
> +};<br>
> +<br>
>   struct si_context {<br>
>       struct pipe_context             b; /* base class */<br>
>   <br>
>       enum radeon_family              family;<br>
>       enum chip_class                 chip_class;<br>
>   <br>
>       struct radeon_winsys            *ws;<br>
>       struct radeon_winsys_ctx        *ctx;<br>
>       struct radeon_cmdbuf            *gfx_cs;<br>
>       struct radeon_cmdbuf            *dma_cs;<br>
> @@ -1074,20 +1084,26 @@ struct si_context {<br>
>       int                             num_perfect_occlusion_queries;<br>
>       struct list_head                active_queries;<br>
>       unsigned                        num_cs_dw_queries_suspend;<br>
>   <br>
>       /* Render condition. */<br>
>       struct pipe_query               *render_cond;<br>
>       unsigned                        render_cond_mode;<br>
>       bool                            render_cond_invert;<br>
>       bool                            render_cond_force_off; /* for u_blitter */<br>
>   <br>
> +     /* For uploading data via GTT and copy to VRAM on context flush via SDMA. */<br>
> +     bool                            sdma_uploads_in_progress;<br>
> +     struct si_sdma_upload           *sdma_uploads;<br>
> +     unsigned                        num_sdma_uploads;<br>
> +     unsigned                        max_sdma_uploads;<br>
> +<br>
>       /* Statistics gathering for the DCC enablement heuristic. It can't be<br>
>        * in si_texture because si_texture can be shared by multiple<br>
>        * contexts. This is for back buffers only. We shouldn't get too many<br>
>        * of those.<br>
>        *<br>
>        * X11 DRI3 rotates among a finite set of back buffers. They should<br>
>        * all fit in this array. If they don't, separate DCC might never be<br>
>        * enabled by DCC stat gathering.<br>
>        */<br>
>       struct {<br>
> @@ -1273,20 +1289,21 @@ struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,<br>
>                                         struct tc_unflushed_batch_token *tc_token);<br>
>   <br>
>   /* si_get.c */<br>
>   void si_init_screen_get_functions(struct si_screen *sscreen);<br>
>   <br>
>   /* si_gfx_cs.c */<br>
>   void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,<br>
>                    struct pipe_fence_handle **fence);<br>
>   void si_begin_new_gfx_cs(struct si_context *ctx);<br>
>   void si_need_gfx_cs_space(struct si_context *ctx);<br>
> +void si_unref_sdma_uploads(struct si_context *sctx);<br>
>   <br>
>   /* si_gpu_load.c */<br>
>   void si_gpu_load_kill_thread(struct si_screen *sscreen);<br>
>   uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type);<br>
>   unsigned si_end_counter(struct si_screen *sscreen, unsigned type,<br>
>                       uint64_t begin);<br>
>   <br>
>   /* si_compute.c */<br>
>   void si_init_compute_functions(struct si_context *sctx);<br>
>   <br>
> <br>
<br>
-- <br>
Lerne, wie die Welt wirklich ist,<br>
Aber vergiss niemals, wie sie sein sollte.<br>
</blockquote></div></div></div></div>