[Spice-devel] [PATCH spice-common v2] codegen: Make the compiler work out better way to write unaligned memory

Mon Sep 18 14:27:01 UTC 2017

> 
> Has this been tested on actual problematic hardware, and improved
> performance? I remember last time this was discussed, the outcome was
> not always what was expected.
> 
> Christophe
> 

I don't remember old discussions, there were actually 2 different ones
which were confusing as one was referring an old embedded ARM version
and another a more recent.
I remember a memcpy version was proposed.
I don't own an ARM to test.
Was tested with x64 with no regressions introduced.
Knowing the machine instruction, which one fault is causing the issue
and the different platforms I can try and verify the assembly output
cross compiling.

Using an Ubuntu machine (looks like Fedora cross compiler is only for
compiling the kernel) and a cross compiler I got 

00004284 <parse_msg_disconnecting>:
    4284:       f100 020c       add.w   r2, r0, #12
    4288:       428a            cmp     r2, r1
    428a:       d817            bhi.n   42bc <parse_msg_disconnecting+0x38>
    428c:       b5f8            push    {r3, r4, r5, r6, r7, lr}
    428e:       4604            mov     r4, r0
    4290:       2010            movs    r0, #16
    4292:       461d            mov     r5, r3
    4294:       f7ff fffe       bl      0 <malloc>
    4298:       b170            cbz     r0, 42b8 <parse_msg_disconnecting+0x34>
    429a:       6827            ldr     r7, [r4, #0]
    429c:       2310            movs    r3, #16
    429e:       6866            ldr     r6, [r4, #4]
    42a0:       f240 0200       movw    r2, #0
    42a4:       68a1            ldr     r1, [r4, #8]
    42a6:       f2c0 0200       movt    r2, #0
    42aa:       602b            str     r3, [r5, #0]
    42ac:       9b06            ldr     r3, [sp, #24]
    42ae:       6007            str     r7, [r0, #0]
    42b0:       6046            str     r6, [r0, #4]
    42b2:       6081            str     r1, [r0, #8]
    42b4:       601a            str     r2, [r3, #0]
    42b6:       bdf8            pop     {r3, r4, r5, r6, r7, pc}
    42b8:       2000            movs    r0, #0
    42ba:       bdf8            pop     {r3, r4, r5, r6, r7, pc}
    42bc:       2000            movs    r0, #0
    42be:       4770            bx      lr

instead of

0000427c <parse_msg_disconnecting>:
    427c:       f100 020c       add.w   r2, r0, #12
    4280:       428a            cmp     r2, r1
    4282:       d817            bhi.n   42b4 <parse_msg_disconnecting+0x38>
    4284:       b570            push    {r4, r5, r6, lr}
    4286:       4604            mov     r4, r0
    4288:       2010            movs    r0, #16
    428a:       461d            mov     r5, r3
    428c:       f7ff fffe       bl      0 <malloc>
    4290:       b170            cbz     r0, 42b0 <parse_msg_disconnecting+0x34>
    4292:       e9d4 2300       ldrd    r2, r3, [r4]
    4296:       2610            movs    r6, #16
    4298:       68a4            ldr     r4, [r4, #8]
    429a:       f240 0100       movw    r1, #0
    429e:       602e            str     r6, [r5, #0]
    42a0:       f2c0 0100       movt    r1, #0
    42a4:       e9c0 2300       strd    r2, r3, [r0]
    42a8:       9b04            ldr     r3, [sp, #16]
    42aa:       6084            str     r4, [r0, #8]
    42ac:       6019            str     r1, [r3, #0]
    42ae:       bd70            pop     {r4, r5, r6, pc}
    42b0:       2000            movs    r0, #0
    42b2:       bd70            pop     {r4, r5, r6, pc}
    42b4:       2000            movs    r0, #0
    42b6:       4770            bx      lr

The platform is gnueabihf. As you can see the first assembly is using
only ldr/str while second one, assuming alignment of 64 bit, uses ldrd/strd.
On this platform ldr/str can do unaligned access while ldrd/strd cannot
(note that ldr is using 32 bit access while ldrd does 64 bit access)

Frediano

> On Mon, Sep 18, 2017 at 02:52:57AM -0400, Frediano Ziglio wrote:
> > ping
> > 
> > I think I sent a reply after the new path which could be confusing.
> > 
> > > 
> > > Instead of assuming that the system can safely do unaligned access
> > > to memory use packed structures to allow the compiler generate
> > > best code possible.
> > > A packed structure tells the compiler to not leave padding inside it
> > > and that the structure can be unaligned so any field can be unaligned
> > > having to generate proper access code based on architecture.
> > > For instance ARM7 can use unaligned access but not for 64 bit
> > > numbers (currently these accesses are emulated by Linux kernel
> > > with obvious performance consequences).
> > > 
> > > This changes the current methods from:
> > > 
> > > #ifdef WORDS_BIGENDIAN
> > > #define read_uint32(ptr) ((uint32_t)SPICE_BYTESWAP32(*((uint32_t
> > > *)(ptr))))
> > > #define write_uint32(ptr, val) *(uint32_t *)(ptr) =
> > > SPICE_BYTESWAP32((uint32_t)val)
> > > #else
> > > #define read_uint32(ptr) (*((uint32_t *)(ptr)))
> > > #define write_uint32(ptr, val) (*((uint32_t *)(ptr))) = val
> > > #endif
> > > 
> > > to:
> > > #include <spice/start-packed.h>
> > > typedef struct SPICE_ATTR_PACKED {
> > >     uint32_t v;
> > > } uint32_unaligned_t;
> > > #include <spice/end-packed.h>
> > > 
> > > #ifdef WORDS_BIGENDIAN
> > > #define read_uint32(ptr) ((uint32_t)SPICE_BYTESWAP32(((uint32_unaligned_t
> > > *)(ptr))->v))
> > > #define write_uint32(ptr, val) ((uint32_unaligned_t *)(ptr))->v =
> > > SPICE_BYTESWAP32((uint32_t)val)
> > > #else
> > > #define read_uint32(ptr) (((uint32_unaligned_t *)(ptr))->v)
> > > #define write_uint32(ptr, val) (((uint32_unaligned_t *)(ptr))->v) = val
> > > #endif
> > > 
> > > Signed-off-by: Frediano Ziglio <fziglio at redhat.com>
> > > ---
> > >  python_modules/demarshal.py | 33 +++++++++++++++++++++++----------
> > >  1 file changed, 23 insertions(+), 10 deletions(-)
> > > 
> > > Changes since v1:
> > > - improved commit message;
> > > - added SPICE_ATTR_PACKED to structure definition for compatibility;
> > > - use intX_unaligned_t instead of intX_unaligned_p.
> > > 
> > > diff --git a/python_modules/demarshal.py b/python_modules/demarshal.py
> > > index 1ea131d..da87d44 100644
> > > --- a/python_modules/demarshal.py
> > > +++ b/python_modules/demarshal.py
> > > @@ -40,24 +40,37 @@ def write_parser_helpers(writer):
> > >  
> > >      writer = writer.function_helper()
> > >  
> > > +    writer.writeln("#include <spice/start-packed.h>")
> > > +    for size in [16, 32, 64]:
> > > +        for sign in ["", "u"]:
> > > +            type = "%sint%d" % (sign, size)
> > > +            writer.begin_block("typedef struct SPICE_ATTR_PACKED")
> > > +            writer.variable_def("%s_t" % type, "v")
> > > +            writer.end_block(newline=False)
> > > +            writer.writeln(" %s_unaligned_t;" % type)
> > > +    writer.writeln("#include <spice/end-packed.h>")
> > > +    writer.newline()
> > > +
> > > +    for sign in ["", "u"]:
> > > +        type = "%sint8" % sign
> > > +        writer.macro("read_%s" % type, "ptr", "(*((%s_t *)(ptr)))" %
> > > type)
> > > +        writer.macro("write_%s" % type, "ptr, val", "*(%s_t *)(ptr) =
> > > val" %
> > > (type))
> > > +    writer.newline()
> > > +
> > >      writer.writeln("#ifdef WORDS_BIGENDIAN")
> > > -    for size in [8, 16, 32, 64]:
> > > +    for size in [16, 32, 64]:
> > >          for sign in ["", "u"]:
> > >              utype = "uint%d" % (size)
> > >              type = "%sint%d" % (sign, size)
> > >              swap = "SPICE_BYTESWAP%d" % size
> > > -            if size == 8:
> > > -                writer.macro("read_%s" % type, "ptr", "(*((%s_t
> > > *)(ptr)))" %
> > > type)
> > > -                writer.macro("write_%s" % type, "ptr, val", "*(%s_t
> > > *)(ptr)
> > > = val" % (type))
> > > -            else:
> > > -                writer.macro("read_%s" % type, "ptr", "((%s_t)%s(*((%s_t
> > > *)(ptr))))" % (type, swap, utype))
> > > -                writer.macro("write_%s" % type, "ptr, val", "*(%s_t
> > > *)(ptr)
> > > = %s((%s_t)val)" % (utype, swap, utype))
> > > +            writer.macro("read_%s" % type, "ptr",
> > > "((%s_t)%s(((%s_unaligned_t *)(ptr))->v))" % (type, swap, utype))
> > > +            writer.macro("write_%s" % type, "ptr, val",
> > > "((%s_unaligned_t
> > > *)(ptr))->v = %s((%s_t)val)" % (utype, swap, utype))
> > >      writer.writeln("#else")
> > > -    for size in [8, 16, 32, 64]:
> > > +    for size in [16, 32, 64]:
> > >          for sign in ["", "u"]:
> > >              type = "%sint%d" % (sign, size)
> > > -            writer.macro("read_%s" % type, "ptr", "(*((%s_t *)(ptr)))" %
> > > type)
> > > -            writer.macro("write_%s" % type, "ptr, val", "(*((%s_t
> > > *)(ptr)))
> > > = val" % type)
> > > +            writer.macro("read_%s" % type, "ptr", "(((%s_unaligned_t
> > > *)(ptr))->v)" % type)
> > > +            writer.macro("write_%s" % type, "ptr, val",
> > > "(((%s_unaligned_t
> > > *)(ptr))->v) = val" % type)
> > >      writer.writeln("#endif")
> > >  
> > >      for size in [8, 16, 32, 64]: