1 year ago

#20910

test-img

Francis Cugler

Comparing C++17 Compilers and their generated optimized assembly

Here's a basic working C++ program.

#include <string>
#include <string_view>
#include <iostream>

template<typename ValueType>
struct BasicType {
    ValueType value{};
    std::string type{};

    BasicType(ValueType T, const std::string_view desc) :
    value{T}, type{desc} {}
};

template<typename T>
std::ostream& operator<<(std::ostream& out, const BasicType<T> BT ) {
    out << BT.value << BT.type;
    return out;
}

struct IntT : public BasicType<int> {
    IntT(int value, const std::string_view desc) : 
      BasicType(value, desc) 
    {}
};
struct FloatT : public BasicType<float> {
    FloatT(float value, const std::string_view desc) : 
      BasicType(value, desc) 
    {}
};

int main() {
    IntT hours(3, "hrs");
    FloatT seconds(2.5f, "s");

    std::cout << hours << " and " << seconds;

    return 0;  
}

And it's output is quite obvious

3hrs and 2.5s

There is no issue with the code and it's just a test sample. I was however curious as to how some of the major different compilers treat this code differently to better understand their internal workings and the pros/cons of each compiler.

I used Compiler Explorer to test this out just to look at the differences in the generated assembly. I'm compiling each under c++17 with O2 optimization for performance speed rather than O1 for code size.


Here's the generated assembly from Clang(trunk) with compiler flags set to: -std=C++17 -O2

main:                                   # @main
        push    rbp
        push    r15
        push    r14
        push    r13
        push    r12
        push    rbx
        sub     rsp, 168
        mov     dword ptr [rsp + 128], 3
        lea     r15, [rsp + 152]
        mov     qword ptr [rsp + 136], r15
        mov     dword ptr [rsp + 152], 7565928
        mov     qword ptr [rsp + 144], 3
        mov     dword ptr [rsp + 48], 1075838976
        lea     r12, [rsp + 72]
        mov     qword ptr [rsp + 56], r12
        mov     word ptr [rsp + 72], 115
        mov     qword ptr [rsp + 64], 1
        mov     dword ptr [rsp + 88], 3
        lea     r13, [rsp + 112]
        mov     qword ptr [rsp + 96], r13
        mov     dword ptr [rsp + 112], 7565928
        mov     qword ptr [rsp + 104], 3
        mov     edi, offset std::cout
        mov     esi, 3
        call    std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
        mov     rsi, qword ptr [rsp + 96]
        mov     rdx, qword ptr [rsp + 104]
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        mov     edi, offset std::cout
        mov     esi, offset .L.str.2
        mov     edx, 5
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        movss   xmm0, dword ptr [rsp + 48]      # xmm0 = mem[0],zero,zero,zero
        movss   dword ptr [rsp + 8], xmm0
        lea     rbp, [rsp + 32]
        mov     qword ptr [rsp + 16], rbp
        mov     r14, qword ptr [rsp + 56]
        mov     rbx, qword ptr [rsp + 64]
        mov     qword ptr [rsp], rbx
        cmp     rbx, 15
        jbe     .LBB0_4
        lea     rdi, [rsp + 16]
        mov     rsi, rsp
        xor     edx, edx
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_create(unsigned long&, unsigned long)
        mov     qword ptr [rsp + 16], rax
        mov     rcx, qword ptr [rsp]
        mov     qword ptr [rsp + 32], rcx
        test    rbx, rbx
        jne     .LBB0_8
        jmp     .LBB0_11
.LBB0_4:
        mov     rax, rbp
        test    rbx, rbx
        je      .LBB0_11
.LBB0_8:
        cmp     rbx, 1
        jne     .LBB0_10
        mov     cl, byte ptr [r14]
        mov     byte ptr [rax], cl
        jmp     .LBB0_11
.LBB0_10:
        mov     rdi, rax
        mov     rsi, r14
        mov     rdx, rbx
        call    memcpy@PLT
.LBB0_11:
        mov     rax, qword ptr [rsp]
        mov     qword ptr [rsp + 24], rax
        mov     rcx, qword ptr [rsp + 16]
        mov     byte ptr [rcx + rax], 0
        movss   xmm0, dword ptr [rsp + 8]       # xmm0 = mem[0],zero,zero,zero
        cvtss2sd        xmm0, xmm0
        mov     edi, offset std::cout
        call    std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
        mov     rsi, qword ptr [rsp + 16]
        mov     rdx, qword ptr [rsp + 24]
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        mov     rdi, qword ptr [rsp + 16]
        cmp     rdi, rbp
        je      .LBB0_15
        call    operator delete(void*)
.LBB0_15:
        mov     rdi, qword ptr [rsp + 96]
        cmp     rdi, r13
        je      .LBB0_17
        call    operator delete(void*)
.LBB0_17:
        mov     rdi, qword ptr [rsp + 56]
        cmp     rdi, r12
        je      .LBB0_19
        call    operator delete(void*)
.LBB0_19:
        mov     rdi, qword ptr [rsp + 136]
        cmp     rdi, r15
        je      .LBB0_21
        call    operator delete(void*)
.LBB0_21:
        xor     eax, eax
        add     rsp, 168
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        pop     rbp
        ret
        mov     rbx, rax
        mov     rdi, qword ptr [rsp + 16]
        cmp     rdi, rbp
        je      .LBB0_25
        call    operator delete(void*)
        jmp     .LBB0_25
        mov     rbx, rax
.LBB0_25:
        mov     rdi, qword ptr [rsp + 96]
        cmp     rdi, r13
        je      .LBB0_27
        call    operator delete(void*)
.LBB0_27:
        mov     rdi, qword ptr [rsp + 56]
        cmp     rdi, r12
        je      .LBB0_29
        call    operator delete(void*)
.LBB0_29:
        mov     rdi, qword ptr [rsp + 136]
        cmp     rdi, r15
        je      .LBB0_31
        call    operator delete(void*)
.LBB0_31:
        mov     rdi, rbx
        call    _Unwind_Resume@PLT
_GLOBAL__sub_I_example.cpp:             # @_GLOBAL__sub_I_example.cpp
        push    rax
        mov     edi, offset std::__ioinit
        call    std::ios_base::Init::Init() [complete object constructor]
        mov     edi, offset std::ios_base::Init::~Init() [complete object destructor]
        mov     esi, offset std::__ioinit
        mov     edx, offset __dso_handle
        pop     rax
        jmp     __cxa_atexit                    # TAILCALL

.L.str.2:
        .asciz  " and "

And Clang is generating 147 lines of assembly.


Here is GCC(trunk) generated assembly with compiler options set to: -std=c++17 -O2

.LC0:
        .string "hrs"
.LC2:
        .string "s"
.LC3:
        .string " and "
main:
        push    rbx
        mov     edx, OFFSET FLAT:.LC0+3
        mov     esi, OFFSET FLAT:.LC0
        sub     rsp, 192
        lea     rax, [rsp+24]
        lea     rdi, [rsp+8]
        mov     DWORD PTR [rsp], 3
        mov     QWORD PTR [rsp+8], rax
        call    void std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_construct<char const*>(char const*, char const*, std::forward_iterator_tag) [clone .isra.0]
        lea     rax, [rsp+72]
        mov     edx, OFFSET FLAT:.LC2+1
        mov     esi, OFFSET FLAT:.LC2
        lea     rdi, [rsp+56]
        mov     DWORD PTR [rsp+48], 0x40200000
        mov     QWORD PTR [rsp+56], rax
        call    void std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_construct<char const*>(char const*, char const*, std::forward_iterator_tag) [clone .isra.0]
        mov     eax, DWORD PTR [rsp]
        lea     rsi, [rsp+8]
        lea     rdi, [rsp+104]
        mov     DWORD PTR [rsp+96], eax
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) [complete object constructor]
        mov     esi, DWORD PTR [rsp+96]
        mov     edi, OFFSET FLAT:_ZSt4cout
        call    std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
        mov     rdx, QWORD PTR [rsp+112]
        mov     rsi, QWORD PTR [rsp+104]
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        mov     esi, OFFSET FLAT:.LC3
        mov     edi, OFFSET FLAT:_ZSt4cout
        call    std::basic_ostream<char, std::char_traits<char> >& std::operator<< <std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*)
        movss   xmm0, DWORD PTR [rsp+48]
        lea     rsi, [rsp+56]
        lea     rdi, [rsp+152]
        mov     rbx, rax
        movss   DWORD PTR [rsp+144], xmm0
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) [complete object constructor]
        pxor    xmm0, xmm0
        mov     rdi, rbx
        cvtss2sd        xmm0, DWORD PTR [rsp+144]
        call    std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
        mov     rdx, QWORD PTR [rsp+160]
        mov     rsi, QWORD PTR [rsp+152]
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        lea     rdi, [rsp+152]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
        lea     rdi, [rsp+104]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
        lea     rdi, [rsp+56]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
        lea     rdi, [rsp+8]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
        add     rsp, 192
        xor     eax, eax
        pop     rbx
        ret
        mov     rbx, rax
        jmp     .L10
        mov     rbx, rax
        jmp     .L7
        mov     rbx, rax
        jmp     .L8
        mov     rbx, rax
        jmp     .L9
main.cold:
        lea     rdi, [rsp+152]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
.L8:
        lea     rdi, [rsp+104]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
.L9:
        lea     rdi, [rsp+56]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
.L10:
        lea     rdi, [rsp+8]
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
        mov     rdi, rbx
        call    _Unwind_Resume
_GLOBAL__sub_I_main:
        sub     rsp, 8
        mov     edi, OFFSET FLAT:_ZStL8__ioinit
        call    std::ios_base::Init::Init() [complete object constructor]
        mov     edx, OFFSET FLAT:__dso_handle
        mov     esi, OFFSET FLAT:_ZStL8__ioinit
        mov     edi, OFFSET FLAT:_ZNSt8ios_base4InitD1Ev
        add     rsp, 8
        jmp     __cxa_atexit

And this produces 95 lines of assembly instructions.


Finally we get to MSVC- x64 msvc v19.latest with compiler options set to: /std:c++17 /O2 and needless to say it's too large to post here directly so here is the link with the MSVC options Compiler Explorer ...And this amazingly generates a little over 1600 lines of code.


A jump from about 95 to 150 lines of instructions from GCC to Clang is one thing, but the jump to over 1500+ lines is something else. What gives? Is this major difference in generated assembly instructions between the compilers themselves and how MSVC implements their version of the C++17 language or is this related to how Compiler Explorer works with how they implement each compiler? And why is there such a dramatic jump in instructions?

c++

assembly

c++17

compiler-optimization

compiler-explorer

0 Answers

Your Answer

Accepted video resources