[Spice-devel] [PATCH spice-common v2] codegen: Make the compiler work out better way to write unaligned memory

Mon Sep 18 14:43:18 UTC 2017

> > 
> > Has this been tested on actual problematic hardware, and improved
> > performance? I remember last time this was discussed, the outcome was
> > not always what was expected.
> > 
> > Christophe
> > 
> 
> I don't remember old discussions, there were actually 2 different ones
> which were confusing as one was referring an old embedded ARM version
> and another a more recent.
> I remember a memcpy version was proposed.
> I don't own an ARM to test.
> Was tested with x64 with no regressions introduced.
> Knowing the machine instruction, which one fault is causing the issue
> and the different platforms I can try and verify the assembly output
> cross compiling.
> 
> Using an Ubuntu machine (looks like Fedora cross compiler is only for
> compiling the kernel) and a cross compiler I got
> 
> 00004284 <parse_msg_disconnecting>:
>     4284:       f100 020c       add.w   r2, r0, #12
>     4288:       428a            cmp     r2, r1
>     428a:       d817            bhi.n   42bc <parse_msg_disconnecting+0x38>
>     428c:       b5f8            push    {r3, r4, r5, r6, r7, lr}
>     428e:       4604            mov     r4, r0
>     4290:       2010            movs    r0, #16
>     4292:       461d            mov     r5, r3
>     4294:       f7ff fffe       bl      0 <malloc>
>     4298:       b170            cbz     r0, 42b8
>     <parse_msg_disconnecting+0x34>
>     429a:       6827            ldr     r7, [r4, #0]
>     429c:       2310            movs    r3, #16
>     429e:       6866            ldr     r6, [r4, #4]
>     42a0:       f240 0200       movw    r2, #0
>     42a4:       68a1            ldr     r1, [r4, #8]
>     42a6:       f2c0 0200       movt    r2, #0
>     42aa:       602b            str     r3, [r5, #0]
>     42ac:       9b06            ldr     r3, [sp, #24]
>     42ae:       6007            str     r7, [r0, #0]
>     42b0:       6046            str     r6, [r0, #4]
>     42b2:       6081            str     r1, [r0, #8]
>     42b4:       601a            str     r2, [r3, #0]
>     42b6:       bdf8            pop     {r3, r4, r5, r6, r7, pc}
>     42b8:       2000            movs    r0, #0
>     42ba:       bdf8            pop     {r3, r4, r5, r6, r7, pc}
>     42bc:       2000            movs    r0, #0
>     42be:       4770            bx      lr
> 
> instead of
> 
> 0000427c <parse_msg_disconnecting>:
>     427c:       f100 020c       add.w   r2, r0, #12
>     4280:       428a            cmp     r2, r1
>     4282:       d817            bhi.n   42b4 <parse_msg_disconnecting+0x38>
>     4284:       b570            push    {r4, r5, r6, lr}
>     4286:       4604            mov     r4, r0
>     4288:       2010            movs    r0, #16
>     428a:       461d            mov     r5, r3
>     428c:       f7ff fffe       bl      0 <malloc>
>     4290:       b170            cbz     r0, 42b0
>     <parse_msg_disconnecting+0x34>
>     4292:       e9d4 2300       ldrd    r2, r3, [r4]
>     4296:       2610            movs    r6, #16
>     4298:       68a4            ldr     r4, [r4, #8]
>     429a:       f240 0100       movw    r1, #0
>     429e:       602e            str     r6, [r5, #0]
>     42a0:       f2c0 0100       movt    r1, #0
>     42a4:       e9c0 2300       strd    r2, r3, [r0]
>     42a8:       9b04            ldr     r3, [sp, #16]
>     42aa:       6084            str     r4, [r0, #8]
>     42ac:       6019            str     r1, [r3, #0]
>     42ae:       bd70            pop     {r4, r5, r6, pc}
>     42b0:       2000            movs    r0, #0
>     42b2:       bd70            pop     {r4, r5, r6, pc}
>     42b4:       2000            movs    r0, #0
>     42b6:       4770            bx      lr
> 
> The platform is gnueabihf. As you can see the first assembly is using
> only ldr/str while second one, assuming alignment of 64 bit, uses ldrd/strd.
> On this platform ldr/str can do unaligned access while ldrd/strd cannot
> (note that ldr is using 32 bit access while ldrd does 64 bit access)
> 

See http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka15414.html,
specifically "Instructions which do NOT support unaligned accesses include:"

Frediano

> 
> > On Mon, Sep 18, 2017 at 02:52:57AM -0400, Frediano Ziglio wrote:
> > > ping
> > > 
> > > I think I sent a reply after the new path which could be confusing.
> > > 
> > > > 
> > > > Instead of assuming that the system can safely do unaligned access
> > > > to memory use packed structures to allow the compiler generate
> > > > best code possible.
> > > > A packed structure tells the compiler to not leave padding inside it
> > > > and that the structure can be unaligned so any field can be unaligned
> > > > having to generate proper access code based on architecture.
> > > > For instance ARM7 can use unaligned access but not for 64 bit
> > > > numbers (currently these accesses are emulated by Linux kernel
> > > > with obvious performance consequences).
> > > > 
> > > > This changes the current methods from:
> > > > 
> > > > #ifdef WORDS_BIGENDIAN
> > > > #define read_uint32(ptr) ((uint32_t)SPICE_BYTESWAP32(*((uint32_t
> > > > *)(ptr))))
> > > > #define write_uint32(ptr, val) *(uint32_t *)(ptr) =
> > > > SPICE_BYTESWAP32((uint32_t)val)
> > > > #else
> > > > #define read_uint32(ptr) (*((uint32_t *)(ptr)))
> > > > #define write_uint32(ptr, val) (*((uint32_t *)(ptr))) = val
> > > > #endif
> > > > 
> > > > to:
> > > > #include <spice/start-packed.h>
> > > > typedef struct SPICE_ATTR_PACKED {
> > > >     uint32_t v;
> > > > } uint32_unaligned_t;
> > > > #include <spice/end-packed.h>
> > > > 
> > > > #ifdef WORDS_BIGENDIAN
> > > > #define read_uint32(ptr)
> > > > ((uint32_t)SPICE_BYTESWAP32(((uint32_unaligned_t
> > > > *)(ptr))->v))
> > > > #define write_uint32(ptr, val) ((uint32_unaligned_t *)(ptr))->v =
> > > > SPICE_BYTESWAP32((uint32_t)val)
> > > > #else
> > > > #define read_uint32(ptr) (((uint32_unaligned_t *)(ptr))->v)
> > > > #define write_uint32(ptr, val) (((uint32_unaligned_t *)(ptr))->v) = val
> > > > #endif
> > > > 
> > > > Signed-off-by: Frediano Ziglio <fziglio at redhat.com>
> > > > ---
> > > >  python_modules/demarshal.py | 33 +++++++++++++++++++++++----------
> > > >  1 file changed, 23 insertions(+), 10 deletions(-)
> > > > 
> > > > Changes since v1:
> > > > - improved commit message;
> > > > - added SPICE_ATTR_PACKED to structure definition for compatibility;
> > > > - use intX_unaligned_t instead of intX_unaligned_p.
> > > > 
> > > > diff --git a/python_modules/demarshal.py b/python_modules/demarshal.py
> > > > index 1ea131d..da87d44 100644
> > > > --- a/python_modules/demarshal.py
> > > > +++ b/python_modules/demarshal.py
> > > > @@ -40,24 +40,37 @@ def write_parser_helpers(writer):
> > > >  
> > > >      writer = writer.function_helper()
> > > >  
> > > > +    writer.writeln("#include <spice/start-packed.h>")
> > > > +    for size in [16, 32, 64]:
> > > > +        for sign in ["", "u"]:
> > > > +            type = "%sint%d" % (sign, size)
> > > > +            writer.begin_block("typedef struct SPICE_ATTR_PACKED")
> > > > +            writer.variable_def("%s_t" % type, "v")
> > > > +            writer.end_block(newline=False)
> > > > +            writer.writeln(" %s_unaligned_t;" % type)
> > > > +    writer.writeln("#include <spice/end-packed.h>")
> > > > +    writer.newline()
> > > > +
> > > > +    for sign in ["", "u"]:
> > > > +        type = "%sint8" % sign
> > > > +        writer.macro("read_%s" % type, "ptr", "(*((%s_t *)(ptr)))" %
> > > > type)
> > > > +        writer.macro("write_%s" % type, "ptr, val", "*(%s_t *)(ptr) =
> > > > val" %
> > > > (type))
> > > > +    writer.newline()
> > > > +
> > > >      writer.writeln("#ifdef WORDS_BIGENDIAN")
> > > > -    for size in [8, 16, 32, 64]:
> > > > +    for size in [16, 32, 64]:
> > > >          for sign in ["", "u"]:
> > > >              utype = "uint%d" % (size)
> > > >              type = "%sint%d" % (sign, size)
> > > >              swap = "SPICE_BYTESWAP%d" % size
> > > > -            if size == 8:
> > > > -                writer.macro("read_%s" % type, "ptr", "(*((%s_t
> > > > *)(ptr)))" %
> > > > type)
> > > > -                writer.macro("write_%s" % type, "ptr, val", "*(%s_t
> > > > *)(ptr)
> > > > = val" % (type))
> > > > -            else:
> > > > -                writer.macro("read_%s" % type, "ptr",
> > > > "((%s_t)%s(*((%s_t
> > > > *)(ptr))))" % (type, swap, utype))
> > > > -                writer.macro("write_%s" % type, "ptr, val", "*(%s_t
> > > > *)(ptr)
> > > > = %s((%s_t)val)" % (utype, swap, utype))
> > > > +            writer.macro("read_%s" % type, "ptr",
> > > > "((%s_t)%s(((%s_unaligned_t *)(ptr))->v))" % (type, swap, utype))
> > > > +            writer.macro("write_%s" % type, "ptr, val",
> > > > "((%s_unaligned_t
> > > > *)(ptr))->v = %s((%s_t)val)" % (utype, swap, utype))
> > > >      writer.writeln("#else")
> > > > -    for size in [8, 16, 32, 64]:
> > > > +    for size in [16, 32, 64]:
> > > >          for sign in ["", "u"]:
> > > >              type = "%sint%d" % (sign, size)
> > > > -            writer.macro("read_%s" % type, "ptr", "(*((%s_t *)(ptr)))"
> > > > %
> > > > type)
> > > > -            writer.macro("write_%s" % type, "ptr, val", "(*((%s_t
> > > > *)(ptr)))
> > > > = val" % type)
> > > > +            writer.macro("read_%s" % type, "ptr", "(((%s_unaligned_t
> > > > *)(ptr))->v)" % type)
> > > > +            writer.macro("write_%s" % type, "ptr, val",
> > > > "(((%s_unaligned_t
> > > > *)(ptr))->v) = val" % type)
> > > >      writer.writeln("#endif")
> > > >  
> > > >      for size in [8, 16, 32, 64]: