[PATCH 02/13] glamor: Add glamor_program based copy acceleration

Tue May 13 04:32:35 PDT 2014

Am 2014-05-06 00:02, schrieb Keith Packard:
> +static const glamor_facet glamor_facet_copyplane = {
> +    "copy_plane",
> +    .version = 130,
> +    .vs_vars = "attribute vec2 primitive;\n",
> +    .vs_exec = (GLAMOR_POS(gl_Position, (primitive.xy))
> +                "       fill_pos = (fill_offset + primitive.xy) /
> fill_size;\n"),
> +    .fs_exec = ("       uvec4 bits = uvec4(texture2D(sampler,
> fill_pos) * bitmul + vec4(0.5,0.5,0.5,0.5));\n"

I think
uvec4(round(texture2D(sampler, fill_pos) * bitmul))
is easier to read than to add 0.5. Honestly GLSL lacks rounding + 
convertion functions :/

> +static void
> +glamor_copy_bail(DrawablePtr src,
> +                 DrawablePtr dst,
> +                 GCPtr gc,
> +                 BoxPtr box,
> +                 int nbox,
> +                 int dx,
> +                 int dy,
> +                 Bool reverse,
> +                 Bool upsidedown,
> +                 Pixel bitplane,
> +                 void *closure)
> +{
> +    if (glamor_prepare_access(dst, GLAMOR_ACCESS_RW) &&
> glamor_prepare_access(src, GLAMOR_ACCESS_RO)) {
> +        if (bitplane) {
> +            if (src->bitsPerPixel > 1)
> +                fbCopyNto1(src, dst, gc, box, nbox, dx, dy,
> +                           reverse, upsidedown, bitplane, closure);
> +            else
> +                fbCopy1toN(src, dst, gc, box, nbox, dx, dy,
> +                           reverse, upsidedown, bitplane, closure);
> +        } else {
> +            fbCopyNtoN(src, dst, gc, box, nbox, dx, dy,
> +                       reverse, upsidedown, bitplane, closure);
> +        }

This logic should be in fbCopyNtoN imo.

> +static Bool
> +glamor_copy_fbo_fbo_draw(DrawablePtr src,
...
> +    /* Set up the vertex buffers for the points */
> +
> +    v = glamor_get_vbo_space(dst->pScreen, nbox * 8 * sizeof
> (int16_t), &vbo_offset);
> +
> +    glEnableVertexAttribArray(GLAMOR_VERTEX_POS);
> +    glVertexAttribPointer(GLAMOR_VERTEX_POS, 2, GL_SHORT, GL_FALSE,
> +                          2 * sizeof (GLshort), vbo_offset);
> +
> +    for (n = 0; n < nbox; n++) {
> +        v[0] = box->x1; v[1] = box->y1;
> +        v[2] = box->x1; v[3] = box->y2;
> +        v[4] = box->x2; v[5] = box->y2;
> +        v[6] = box->x2; v[7] = box->y1;
> +        v += 8;
> +        box++;
> +    }

Why didn't you use instancing this time? We'd be able to memcpy here.

> +
> +    glamor_put_vbo_space(screen);
> +
> +    glamor_get_drawable_deltas(src, src_pixmap, &src_off_x, 
> &src_off_y);
> +
> +    set_scissor = src_priv->type == GLAMOR_TEXTURE_LARGE;
> +    if (set_scissor)
> +        glEnable(GL_SCISSOR_TEST);
> +
> +    glamor_pixmap_loop(src_priv, src_box_x, src_box_y) {
> +        BoxPtr src_box = glamor_pixmap_box_at(src_priv, src_box_x, 
> src_box_y);
> +
> +        args.dx = dx + src_off_x - src_box->x1;
> +        args.dy = dy + src_off_y - src_box->y1;
> +        args.src = glamor_pixmap_fbo_at(src_priv, src_box_x, 
> src_box_y);
> +
> +        if (!glamor_use_program(dst_pixmap, gc, prog, &args))
> +            goto bail_ctx;
> +
> +        glamor_pixmap_loop(dst_priv, dst_box_x, dst_box_y) {
> +            glamor_set_destination_drawable(dst, dst_box_x,
> dst_box_y, FALSE, FALSE,
> +                                            prog->matrix_uniform,
> &dst_off_x, &dst_off_y);

I'm a bit worried about this loop order. We don't change the programm at 
all, so glamor_use_program will be almost a no-op. But we change the FBO 
in the inner loop. Large textures are quite uncommon, but this sounds 
like a waste of time for me.

> +            if (set_scissor)
> +                glScissor(dst_off_x - args.dx,
> +                          dst_off_y - args.dy,
> +                          src_box->x2 - src_box->x1,
> +                          src_box->y2 - src_box->y1);

So there is still no util function for this scissor box handling?

> +            if (glamor_priv->gl_flavor == GLAMOR_GL_DESKTOP)
> +                glDrawArrays(GL_QUADS, 0, nbox * 4);
> +            else {
> +                int i;
> +                for (i = 0; i < nbox; i++)
> +                    glDrawArrays(GL_TRIANGLE_FAN, i*4, 4);
> +            }

iirc we always have an element buffer bound for quads, so we could use 
glDrawElements instead of this for loop.

> +/*
> + * Copy from GPU to GPU, but create
> + * a temporary pixmap in the middle to
> + * correctly handle overlapping copies
> + * in systems lacking glCopyPixels
> + */

Outdated comment, glCopyPixels isn't used any more

> +static Bool
> +glamor_copy_fbo_fbo_temp(DrawablePtr src,
...
> +    /* Sanity check state to avoid getting halfway through and bailing
> +     * at the last second. Might be nice to have checks that didn't
> +     * involve setting state.
> +     */
> +    glamor_make_current(glamor_priv);
> +
> +    if (gc && !glamor_set_planemask(dst_pixmap, gc->planemask))
> +        goto bail_ctx;
> +
> +    if (!glamor_set_alu(screen, gc ? gc->alu : GXcopy))
> +        goto bail_ctx;
> +    glDisable(GL_COLOR_LOGIC_OP);

What would be the "last second"? I guess the first copy is always fine 
and the second one will fail. So we always have to stall and to readback 
a texture. imo it doesn't matter that much to make a gpu based copy 
first. In the end, it's likely faster for FB to copy without 
overlapping.

> +static Bool
> +glamor_copy_needs_temp(DrawablePtr src,
...
> +    glTextureBarrierNV();

I think a comment is required here why we have to call 
glTextureBarrierNV at all. eg what happens when we get two copy calls in 
a row, both doesn't overlap, but they overlap each other. So the second 
call might want to read the result of the first copy which isn't allowed 
without this barrier.

-----------------------------

I'm very happy to see this copy code without any deprecated api usage :)

Reviewed-by: Markus Wick <markus at selfnet.de>