开发者

Quickest way to change endianness

What is the quickest way to reverse the endianness of a 16 bit and 32 bit integer. I usually do something like (this coding was done in Visual Studio in C++):

union bytes4
{
    __int32 value;
    char ch[4];
};

union bytes2
{
    __int16 value;
    char ch[2];
};

__int16 changeEndianness16(__int16 val)
{
    bytes2 temp;
    temp.value=val;

    char x= temp.ch[0];
    temp.ch[0]=temp.ch[1];
    temp.ch[1]=x;
    return temp.value;
}

_开发者_如何学Python_int32 changeEndianness32(__int32 val)
{
    bytes4 temp;
    temp.value=val;
    char x;

    x= temp.ch[0];
    temp.ch[0]=temp.ch[1];
    temp.ch[1]=x;

    x= temp.ch[2];
    temp.ch[2]=temp.ch[3];
    temp.ch[3]=x;
    return temp.value;
}

Is there any faster way to do the same, in which I don't have to do so many calculations?


Why aren't you using the built-in swab function, which is likely optimized better than your code?

Beyond that, the usual bit-shift operations should be fast to begin with, and are so widely used they may be recognized by the optimizer and replaced by even better code.


Because other answers have serious bugs, I'll post a better implementation:

int16_t changeEndianness16(int16_t val)
{
    return (val << 8) |          // left-shift always fills with zeros
          ((val >> 8) & 0x00ff); // right-shift sign-extends, so force to zero
}

None of the compilers I tested generate rolw for this code, I think a slightly longer sequence (in terms of instruction count) is actually faster. Benchmarks would be interesting.

For 32-bit, there are a few possible orders for the operations:

//version 1
int32_t changeEndianness32(int32_t val)
{
    return (val << 24) |
          ((val <<  8) & 0x00ff0000) |
          ((val >>  8) & 0x0000ff00) |
          ((val >> 24) & 0x000000ff);
}

//version 2, one less OR, but has data dependencies
int32_t changeEndianness32(int32_t val)
{
    int32_t tmp = (val << 16) |
                 ((val >> 16) & 0x00ffff);
    return ((tmp >> 8) & 0x00ff00ff) | ((tmp & 0x00ff00ff) << 8);
}


At least in Visual C++, you can use _byteswap_ulong() and friends: http://msdn.microsoft.com/en-us/library/a3140177.aspx

These functions are treated as intrinsics by the VC++ compiler, and will result in generated code that takes advantage of hardware support when available. With VC++ 10.0 SP1, I see the following generated code for x86:

return _byteswap_ulong(val);

mov     eax, DWORD PTR _val$[esp-4]
bswap   eax
ret     0

return _byteswap_ushort(val);

mov     ax, WORD PTR _val$[esp-4]
mov     ch, al
mov     cl, ah
mov     ax, cx
ret     0


Who says it does too many calculations?

out = changeEndianness16(in);

gcc 4.6.0

movzwl  -4(%rsp), %edx
movl    %edx, %eax
movsbl  %dh, %ecx
movb    %cl, %al
movb    %dl, %ah
movw    %ax, -2(%rsp)

clang++ 2.9

movw    -2(%rsp), %ax
rolw    $8, %ax
movw    %ax, -4(%rsp)

Intel C/C++ 11.1

movzwl    4(%rsp), %ecx
rolw      $8, %cx
xorl      %eax, %eax
movw      %cx, 6(%rsp)

What does your compiler produce?


I used the following code for the 16bit version swap function:

_int16 changeEndianness16(__int16 val)
{
    return ((val & 0x00ff) << 8) | ((val & 0xff00) >> 8);
}    

With g++ (Ubuntu/Linaro 4.4.4-14ubuntu5) 4.4.5 the above code when compiled with g++ -O3 -S -fomit-frame-pointer test.cpp results in the following (non-inlined) assembler code:

movzwl  4(%esp), %eax
rolw    $8, %ax
ret

The next code is equivalent but g++ is not as good at optimizing it.

__int16 changeEndianness16_2(__int16 val)
{
    return ((val & 0xff) << 8) | (val >> 8);
}

Compiling it gives more asm code:

movzwl  4(%esp), %edx
movl    %edx, %eax
sarl    $8, %eax
sall    $8, %edx
orl     %edx, %eax
ret
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜