[Mesa-dev] [PATCH] Replace byte-swapping code with builtins in pack.c

Mon Oct 9 05:34:14 UTC 2017

On Sun, Oct 8, 2017 at 12:15 PM, Jochen Rollwagen <joro-2013 at t-online.de> wrote:
> Am 06.10.2017 um 11:57 schrieb Nicolai Hähnle:
>>
>> On 05.10.2017 20:59, Jochen Rollwagen wrote:
>>>
>>> Am 04.10.2017 um 05:59 schrieb Matt Turner:
>>>>
>>>> On Tue, Oct 3, 2017 at 11:01 AM, Jochen Rollwagen
>>>> <joro-2013 at t-online.de> wrote:
>>>>>
>>>>>  From 4cebe50a9bade6717923e104c954f3fad75f71bb Mon Sep 17 00:00:00 2001
>>>>> From: Jochen Rollwagen <joro-2013 at t-online.de>
>>>>> Date: Tue, 3 Oct 2017 19:54:10 +0200
>>>>> Subject: [PATCH] Replace byte-swapping code with builtins in pack.c
>>>>>
>>>>> This patch replaces some code for byte-swapping in pack.c with the
>>>>> builtin
>>>>> functions allowing the compiler to do its optimization magic
>>>>> ---
>>>>>   src/mesa/main/pack.c |   22 ++--------------------
>>>>>   1 file changed, 2 insertions(+), 20 deletions(-)
>>>>>
>>>>> diff --git a/src/mesa/main/pack.c b/src/mesa/main/pack.c
>>>>> index 94a6d28..9bfde39 100644
>>>>> --- a/src/mesa/main/pack.c
>>>>> +++ b/src/mesa/main/pack.c
>>>>> @@ -230,26 +230,8 @@ _mesa_pack_bitmap( GLint width, GLint height,
>>>>> const
>>>>> GLubyte
>>>>>   *source,
>>>>>      }
>>>>>   }
>>>>>
>>>>> -
>>>>> -#define SWAP2BYTE(VALUE)            \
>>>>> -   {                        \
>>>>> -      GLubyte *bytes = (GLubyte *) &(VALUE);    \
>>>>> -      GLubyte tmp = bytes[0];            \
>>>>> -      bytes[0] = bytes[1];            \
>>>>> -      bytes[1] = tmp;                \
>>>>> -   }
>>>>> -
>>>>> -#define SWAP4BYTE(VALUE)            \
>>>>> -   {                        \
>>>>> -      GLubyte *bytes = (GLubyte *) &(VALUE);    \
>>>>> -      GLubyte tmp = bytes[0];            \
>>>>> -      bytes[0] = bytes[3];            \
>>>>> -      bytes[3] = tmp;                \
>>>>> -      tmp = bytes[1];                \
>>>>> -      bytes[1] = bytes[2];            \
>>>>> -      bytes[2] = tmp;                \
>>>>> -   }
>>>>> -
>>>>> +#define SWAP2BYTE(VALUE) __builtin_bswap16(VALUE)
>>>>> +#define SWAP4BYTE(VALUE) __builtin_bswap32(VALUE)
>>>>
>>>> In my experience it's much simpler to just write these as
>>>>
>>>>         return ((x & 0xff) << 8) | ((x >> 8) & 0xff);
>>>>
>>>> and
>>>>
>>>>         return  ((x & 0xff) << 24) |
>>>>                 ((x & 0xff00) << 8) |
>>>>                 ((x & 0xff0000) >> 8) |
>>>>                 ((x >> 24) & 0xff);
>>>>
>>>> and not have to deal with compiler intrinsics. Compilers will
>>>> recognize these patterns and use the appropriate instructions (rol for
>>>> 2-bytes and bswap for 4-bytes).
>>>>
>>>> You should be able to count the numbers of those instructions before
>>>> and after such a patch to confirm it's working as expected.
>>>
>>> Fair enough. I've attached a new patch that optimizes the code on linux
>>> systems only where there is byteswap.h containing (hopefully optimal)
>>> functions. The other systems may be dealt with by a followup patch if
>>> desired.
>>
>>
>> This doesn't really address Matt's point that compilers are good at
>> pattern matching byte swaps already.
>>
>> Unless you can actually show with comparisons of the assembly and/or
>> benchmarks that this results in better code, your patch makes the code base
>> slightly worse because you've now added two different code paths where there
>> was previously only one.
>>
>> Cheers,
>> Nicolai
>
> I'm afraid you're overestimating the compiler's abilities to detect such a
> byte swapping pattern.
>
> The following c code
>
> #include <stdint.h>
>
> static uint32_t
> SWAP4BYTE(uint32_t n)
> {
>    return (n >> 24) |
>           ((n >> 8) & 0x0000ff00) |
>           ((n << 8) & 0x00ff0000) |
>           (n << 24);
> }
>
> static uint32_t
> builtin_SWAP4BYTE(uint32_t n)
> {
>    return __builtin_bswap32(n);
> }
>
> main(){
> SWAP4BYTE(321764);
> buitlin_SWAP4BYTE(321764);
> }
>
> compiles to the following assembler code on powerpc platforms with gcc
> --version gcc (Ubuntu 6.2.0-3ubuntu11~12.04) 6.2.0 20160901:
>
>     .section    ".text"
>     .align 2
>     .type    SWAP4BYTE, @function
> SWAP4BYTE:
>     stwu 1,-32(1)
>     stw 31,28(1)
>     mr 31,1
>     stw 3,12(31)
>     lwz 9,12(31)
>     srwi 10,9,24
>     lwz 9,12(31)
>     srwi 9,9,8
>     rlwinm 9,9,0,16,23
>     or 10,10,9
>     lwz 9,12(31)
>     slwi 9,9,8
>     rlwinm 9,9,0,8,15
>     or 10,10,9
>     lwz 9,12(31)
>     slwi 9,9,24
>     or 9,10,9
>     mr 3,9
>     addi 11,31,32
>     lwz 31,-4(11)
>     mr 1,11
>     blr
>     .size    SWAP4BYTE,.-SWAP4BYTE
>     .align 2
>     .type    builtin_SWAP4BYTE, @function
> builtin_SWAP4BYTE:
>     stwu 1,-32(1)
>     stw 31,28(1)
>     mr 31,1
>     stw 3,12(31)
>     addi 10,31,12
>     lwbrx 9,0,10
>     mr 3,9
>     addi 11,31,32
>     lwz 31,-4(11)
>     mr 1,11
>     blr
>     .size    builtin_SWAP4BYTE,.-builtin_SWAP4BYTE
>     .align 2
>     .globl main
>     .type    main, @function
>
> The builtin function seems slightly more optimized.

Compile with optimization, like a real build. With -O2 it produces
identical assembly on PowerPC for me.