1 year ago
#20910
Francis Cugler
Comparing C++17 Compilers and their generated optimized assembly
Here's a basic working C++ program.
#include <string>
#include <string_view>
#include <iostream>
template<typename ValueType>
struct BasicType {
ValueType value{};
std::string type{};
BasicType(ValueType T, const std::string_view desc) :
value{T}, type{desc} {}
};
template<typename T>
std::ostream& operator<<(std::ostream& out, const BasicType<T> BT ) {
out << BT.value << BT.type;
return out;
}
struct IntT : public BasicType<int> {
IntT(int value, const std::string_view desc) :
BasicType(value, desc)
{}
};
struct FloatT : public BasicType<float> {
FloatT(float value, const std::string_view desc) :
BasicType(value, desc)
{}
};
int main() {
IntT hours(3, "hrs");
FloatT seconds(2.5f, "s");
std::cout << hours << " and " << seconds;
return 0;
}
And it's output is quite obvious
3hrs and 2.5s
There is no issue with the code and it's just a test sample. I was however curious as to how some of the major different compilers treat this code differently to better understand their internal workings and the pros/cons of each compiler.
I used Compiler Explorer to test this out just to look at the differences in the generated assembly. I'm compiling each under c++17
with O2
optimization for performance speed rather than O1
for code size.
Here's the generated assembly from Clang(trunk) with compiler flags set to: -std=C++17 -O2
main: # @main
push rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 168
mov dword ptr [rsp + 128], 3
lea r15, [rsp + 152]
mov qword ptr [rsp + 136], r15
mov dword ptr [rsp + 152], 7565928
mov qword ptr [rsp + 144], 3
mov dword ptr [rsp + 48], 1075838976
lea r12, [rsp + 72]
mov qword ptr [rsp + 56], r12
mov word ptr [rsp + 72], 115
mov qword ptr [rsp + 64], 1
mov dword ptr [rsp + 88], 3
lea r13, [rsp + 112]
mov qword ptr [rsp + 96], r13
mov dword ptr [rsp + 112], 7565928
mov qword ptr [rsp + 104], 3
mov edi, offset std::cout
mov esi, 3
call std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
mov rsi, qword ptr [rsp + 96]
mov rdx, qword ptr [rsp + 104]
mov rdi, rax
call std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
mov edi, offset std::cout
mov esi, offset .L.str.2
mov edx, 5
call std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
movss xmm0, dword ptr [rsp + 48] # xmm0 = mem[0],zero,zero,zero
movss dword ptr [rsp + 8], xmm0
lea rbp, [rsp + 32]
mov qword ptr [rsp + 16], rbp
mov r14, qword ptr [rsp + 56]
mov rbx, qword ptr [rsp + 64]
mov qword ptr [rsp], rbx
cmp rbx, 15
jbe .LBB0_4
lea rdi, [rsp + 16]
mov rsi, rsp
xor edx, edx
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_create(unsigned long&, unsigned long)
mov qword ptr [rsp + 16], rax
mov rcx, qword ptr [rsp]
mov qword ptr [rsp + 32], rcx
test rbx, rbx
jne .LBB0_8
jmp .LBB0_11
.LBB0_4:
mov rax, rbp
test rbx, rbx
je .LBB0_11
.LBB0_8:
cmp rbx, 1
jne .LBB0_10
mov cl, byte ptr [r14]
mov byte ptr [rax], cl
jmp .LBB0_11
.LBB0_10:
mov rdi, rax
mov rsi, r14
mov rdx, rbx
call memcpy@PLT
.LBB0_11:
mov rax, qword ptr [rsp]
mov qword ptr [rsp + 24], rax
mov rcx, qword ptr [rsp + 16]
mov byte ptr [rcx + rax], 0
movss xmm0, dword ptr [rsp + 8] # xmm0 = mem[0],zero,zero,zero
cvtss2sd xmm0, xmm0
mov edi, offset std::cout
call std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
mov rsi, qword ptr [rsp + 16]
mov rdx, qword ptr [rsp + 24]
mov rdi, rax
call std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
mov rdi, qword ptr [rsp + 16]
cmp rdi, rbp
je .LBB0_15
call operator delete(void*)
.LBB0_15:
mov rdi, qword ptr [rsp + 96]
cmp rdi, r13
je .LBB0_17
call operator delete(void*)
.LBB0_17:
mov rdi, qword ptr [rsp + 56]
cmp rdi, r12
je .LBB0_19
call operator delete(void*)
.LBB0_19:
mov rdi, qword ptr [rsp + 136]
cmp rdi, r15
je .LBB0_21
call operator delete(void*)
.LBB0_21:
xor eax, eax
add rsp, 168
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
mov rbx, rax
mov rdi, qword ptr [rsp + 16]
cmp rdi, rbp
je .LBB0_25
call operator delete(void*)
jmp .LBB0_25
mov rbx, rax
.LBB0_25:
mov rdi, qword ptr [rsp + 96]
cmp rdi, r13
je .LBB0_27
call operator delete(void*)
.LBB0_27:
mov rdi, qword ptr [rsp + 56]
cmp rdi, r12
je .LBB0_29
call operator delete(void*)
.LBB0_29:
mov rdi, qword ptr [rsp + 136]
cmp rdi, r15
je .LBB0_31
call operator delete(void*)
.LBB0_31:
mov rdi, rbx
call _Unwind_Resume@PLT
_GLOBAL__sub_I_example.cpp: # @_GLOBAL__sub_I_example.cpp
push rax
mov edi, offset std::__ioinit
call std::ios_base::Init::Init() [complete object constructor]
mov edi, offset std::ios_base::Init::~Init() [complete object destructor]
mov esi, offset std::__ioinit
mov edx, offset __dso_handle
pop rax
jmp __cxa_atexit # TAILCALL
.L.str.2:
.asciz " and "
And Clang is generating 147 lines of assembly.
Here is GCC(trunk) generated assembly with compiler options set to: -std=c++17 -O2
.LC0:
.string "hrs"
.LC2:
.string "s"
.LC3:
.string " and "
main:
push rbx
mov edx, OFFSET FLAT:.LC0+3
mov esi, OFFSET FLAT:.LC0
sub rsp, 192
lea rax, [rsp+24]
lea rdi, [rsp+8]
mov DWORD PTR [rsp], 3
mov QWORD PTR [rsp+8], rax
call void std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_construct<char const*>(char const*, char const*, std::forward_iterator_tag) [clone .isra.0]
lea rax, [rsp+72]
mov edx, OFFSET FLAT:.LC2+1
mov esi, OFFSET FLAT:.LC2
lea rdi, [rsp+56]
mov DWORD PTR [rsp+48], 0x40200000
mov QWORD PTR [rsp+56], rax
call void std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_construct<char const*>(char const*, char const*, std::forward_iterator_tag) [clone .isra.0]
mov eax, DWORD PTR [rsp]
lea rsi, [rsp+8]
lea rdi, [rsp+104]
mov DWORD PTR [rsp+96], eax
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) [complete object constructor]
mov esi, DWORD PTR [rsp+96]
mov edi, OFFSET FLAT:_ZSt4cout
call std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
mov rdx, QWORD PTR [rsp+112]
mov rsi, QWORD PTR [rsp+104]
mov rdi, rax
call std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
mov esi, OFFSET FLAT:.LC3
mov edi, OFFSET FLAT:_ZSt4cout
call std::basic_ostream<char, std::char_traits<char> >& std::operator<< <std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*)
movss xmm0, DWORD PTR [rsp+48]
lea rsi, [rsp+56]
lea rdi, [rsp+152]
mov rbx, rax
movss DWORD PTR [rsp+144], xmm0
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) [complete object constructor]
pxor xmm0, xmm0
mov rdi, rbx
cvtss2sd xmm0, DWORD PTR [rsp+144]
call std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
mov rdx, QWORD PTR [rsp+160]
mov rsi, QWORD PTR [rsp+152]
mov rdi, rax
call std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
lea rdi, [rsp+152]
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
lea rdi, [rsp+104]
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
lea rdi, [rsp+56]
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
lea rdi, [rsp+8]
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
add rsp, 192
xor eax, eax
pop rbx
ret
mov rbx, rax
jmp .L10
mov rbx, rax
jmp .L7
mov rbx, rax
jmp .L8
mov rbx, rax
jmp .L9
main.cold:
lea rdi, [rsp+152]
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
.L8:
lea rdi, [rsp+104]
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
.L9:
lea rdi, [rsp+56]
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
.L10:
lea rdi, [rsp+8]
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
mov rdi, rbx
call _Unwind_Resume
_GLOBAL__sub_I_main:
sub rsp, 8
mov edi, OFFSET FLAT:_ZStL8__ioinit
call std::ios_base::Init::Init() [complete object constructor]
mov edx, OFFSET FLAT:__dso_handle
mov esi, OFFSET FLAT:_ZStL8__ioinit
mov edi, OFFSET FLAT:_ZNSt8ios_base4InitD1Ev
add rsp, 8
jmp __cxa_atexit
And this produces 95 lines of assembly instructions.
Finally we get to MSVC- x64 msvc v19.latest with compiler options set to: /std:c++17 /O2
and needless to say it's too large to post here directly so here is the link with the MSVC options Compiler Explorer ...And this amazingly generates a little over 1600 lines of code.
A jump from about 95 to 150 lines of instructions from GCC to Clang is one thing, but the jump to over 1500+ lines is something else. What gives? Is this major difference in generated assembly instructions between the compilers themselves and how MSVC implements their version of the C++17 language or is this related to how Compiler Explorer works with how they implement each compiler? And why is there such a dramatic jump in instructions?
c++
assembly
c++17
compiler-optimization
compiler-explorer
0 Answers
Your Answer