Why does arm-gcc decrement/increment the stack pointer even when the stack is never accessed?

2023-03-04 08:58 问答作者：

When compiling this program with arm-elf-gcc-4.5 -O3 -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp:

#include <arm_neon.h>
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
    float32x4x2_t
        xxyyzz1(vzipq_f32(v1, v1)),
        xxyyzz2(vzipq_f32(v2, v2));

    float32x2_t
        xx1(vget_low_f32(xxyyzz1.val[0])),
        yy1(vget_high_f32(xxyyzz1.val[0])),
        zz1(vget_low_f32(xxyyzz1.val[1])),
        xx2(vget_low_f32(xxyyzz2.val[0])),
        yy2(vget_high_f32(xxyyzz2.val[0])),
        zz2(vget_low_f32(xxyyzz2.val[1]));

    float32x2_t
        x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
        y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
        z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));

    return vcombine_f32(vuzp_f32(x, y).val[0], z);
}

...this is what I get. Notice the two useless instructions marked with @<<<

_Z5crossRK19__simd128_float32_tS1_:
    vldmia  r0, {d16-d17}
    vldmia  r1, {d22-d23}
    vmov    q10, q8  @ v4sf
    vmov    q9, q11  @ v4sf
    vzip.32 q8, q10
    vzip.32 q11, q9
    vmov    d24, d17
    vmov    d21, d22
    vmov    d22, d23
    vmul.f32    d17, d24, d18
    vmul.f32    d19, d20, d21
    vmls.f32    d19, d16, d18
    vmls.f32    d17, d20, d22
    vmul.f32    d16, d16, d22
    vuzp.32 d17, d19
    vmls.f32    d16, d24, d21
    sub sp, sp, #80 @<<<
    vswp    d17, d16
    vmov    r0, r1, d16  @ v4sf
    vmov    r2, r3, d17
    add sp, sp, #80 @<<<
    bx

The stack is never accessed, yet the stack pointer get decremented, then incremented by the same amount. Why?

If I modify the original code to include asm comment at the end of the prologue and the begining of the epilogue, like this:

#include <arm_neon.h>
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
    asm volatile("# End of prologue");
    float32x4x2_t
        xxyyzz1(vzipq_f32(v1, v1)),
        xxyyzz2(vzipq_f32(v2, v2));

    float32x2_t
        xx1(vget_low_f32(xxyyzz1.val[0])),
        yy1(vget_high_f32(xxyyzz1.val[0])),
        zz1(vget_low_f32(xxyyzz1.val[1])),
        xx2(vget_low_f32(xxyyzz2.val[0])),
        yy2(vget_high_f32(xxyyzz2.val[0])),
        zz2(vget_low_f32(xxyyzz2.val[1]));

    float32x2_t
   开发者_运维技巧     x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
        y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
        z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));

    float32x4_t res(vcombine_f32(vuzp_f32(x, y).val[0], z));
    asm volatile("# Start of epilogue");
    return res;
}

Then I get slightly different version:

_Z5crossRK19__simd128_float32_tS1_:
    sub sp, sp, #80
    # End of prologue
    vldmia  r0, {d16-d17}
    vldmia  r1, {d22-d23}
    vmov    q10, q8  @ v4sf
    vmov    q9, q11  @ v4sf
    vzip.32 q8, q10
    vzip.32 q11, q9
    vmov    d24, d17
    vmov    d21, d22
    vmov    d22, d23
    vmul.f32    d17, d24, d18
    vmul.f32    d19, d20, d21
    vmls.f32    d19, d16, d18
    vmls.f32    d17, d20, d22
    vmul.f32    d16, d16, d22
    vuzp.32 d17, d19
    vmls.f32    d16, d24, d21
    vswp    d17, d16
    # Start of epilogue
    vmov    r0, r1, d16  @ v4sf
    vmov    r2, r3, d17
    add sp, sp, #80
    bx  lr

The stack pointer decrement/increment clearly is part of the prologue/epilogue, and happens even if the stack is not used. Is that to comply with some standard, or is it a gcc optimization bug?

EDIT: Compiler is arm-elf-gcc-4.5 (GCC) 4.5.0, configured with: /opt/local/var/macports/build/_opt_local_var_macports_sources_rsync.macports.org_release_ports_cross_arm-elf-gcc/work/gcc-4.5.0/configure --prefix=/opt/local --infodir=/opt/local/share/info --mandir=/opt/local/share/man --target=arm-elf --program-prefix=arm-elf- --program-suffix=-4.5 --without-included-gettext --enable-obsolete --with-newlib --disable-__cxa_atexit --enable-multilib --enable-biendian --disable-libgfortran --with-gxx-include-dir=/opt/local/arm-elf/include/c++/4.5.0/ --enable-languages=c,c++,objc --build=x86_64-apple-darwin10 --enable-fpu

EDIT: I managed to pinpoint the problem using the following C source. It only happens when using arrays of vector types as temporaries, such as float32x4x2_t which is declared as struct { float32x4_t val[2]; }, even tho these temporaries are made registers. I believe this is a bug, so I reported it.

#include <arm_neon.h>

// This one is ok
extern float32x4_t add(float32x4_t* v1, float32x4_t* v2) {
    return vaddq_f32(*v1, *v2);
#if 0
    produced assembly:

add:
    vldmia  r0, {d16-d17}
    vldmia  r1, {d18-d19}
    vadd.f32    q8, q8, q9
    vmov    r0, r1, d16
    vmov    r2, r3, d17
    bx  lr

#endif
}

// This one uses float32x4x2_t temporaries and has the bug
extern float32x4_t cross(float32x4_t* v1, float32x4_t* v2) {
    float32x4x2_t
        xxyyzz1=vzipq_f32(*v1, *v1),
        xxyyzz2=vzipq_f32(*v2, *v2);

    float32x2_t
        xx1=vget_low_f32(xxyyzz1.val[0]),
        yy1=vget_high_f32(xxyyzz1.val[0]),
        zz1=vget_low_f32(xxyyzz1.val[1]),
        xx2=vget_low_f32(xxyyzz2.val[0]),
        yy2=vget_high_f32(xxyyzz2.val[0]),
        zz2=vget_low_f32(xxyyzz2.val[1]);

    float32x2_t
        x=vmls_f32(vmul_f32(yy1, zz2), zz1, yy2),
        y=vmls_f32(vmul_f32(zz1, xx2), xx1, zz2),
        z=vmls_f32(vmul_f32(xx1, yy2), yy1, xx2);

    return vcombine_f32(vuzp_f32(x, y).val[0], z);
#if 0
    produced assembly:

cross:
    vldmia  r0, {d18-d19}
    vldmia  r1, {d16-d17}
    vmov    q10, q9
    vmov    q11, q8
    vzip.32 q9, q10
    vzip.32 q8, q11
    vmov    d24, d19
    vmov    d21, d16
    vmov    d16, d17
    vmul.f32    d19, d20, d21
    vmul.f32    d17, d24, d22
    vmls.f32    d17, d20, d16
    vmls.f32    d19, d18, d22
    vmul.f32    d16, d18, d16
    vuzp.32 d17, d19
    vmls.f32    d16, d24, d21
    sub sp, sp, #48           @ here
    vswp    d17, d16
    vmov    r0, r1, d16
    vmov    r2, r3, d17
    add sp, sp, #48           @ and here
    bx  lr

#endif
}

This turned out to be a bug, so closing it.

继续阅读：arm assembly gcc neon

Why does arm-gcc decrement/increment the stack pointer even when the stack is never accessed?

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

河神2九牛入海钓河妖是第几集河妖什么来历可活吞牛？

性激素六项检查的最佳时间是多久？多少钱？？

Easiest way to get words of one line from istream into a vector?

《梦在燃烧 (《三国演义》动画片主题曲)》MP3歌词-汤子星？

抽烟只抽炫赫门？