1 year ago
#373465
mas
Why is divsd not much slower than mulsd in this benchmark?
Someone told me that divsd is about 40x slower than mulsd. Curious, I wrote a benchmark. It took some help from greybeards who know more about what the compiler might optimize out, but after several revisions we still cannot find a meaningful difference between divsd and mulsd.
Here are the results:
➜ linked gcc main.c && ./a.out
0.0000000000000000 4.6593234399298842
0.0000000000000000 4.6593234399298842
div: 2080434
mul: 1925889
div / mul: 1.080246
0.000000
And with O3:
➜ linked gcc main.c -O3 && ./a.out
0.0000000000000000 4.6593234399298842
0.0000000000000000 4.6593234399298842
div: 1948388
mul: 1804587
div / mul: 1.079686
0.000000
The code:
#include <time.h>
#include <stdio.h>
#define TRIALS 10000000
/*
 * Benchmark: time a chain of dependent divisions (x /= y) against the
 * equivalent chain of multiplications by the reciprocal (x *= z).
 *
 * Each loop runs TRIALS iterations of five operations and accumulates every
 * intermediate x into a running sum (op / op2). Both sums and the final x are
 * printed, which keeps the arithmetic live. Elapsed CPU time is reported in
 * clock() ticks (divide by CLOCKS_PER_SEC to get seconds), followed by the
 * div/mul ratio and op - op2.
 *
 * NOTE(review): x shrinks by a factor of y (~1.21) on every operation, so
 * after roughly 3,600 of the 5 * TRIALS operations it falls below the
 * smallest normal double. Under round-to-nearest it then stops at a tiny
 * nonzero subnormal (x / y rounds back to x) instead of reaching 0.0; "%.16f"
 * merely prints it as 0.0000000000000000. Almost the entire run therefore
 * operates on a subnormal x. Many x86 cores handle subnormal operands and
 * results through a slow assist path for both divsd and mulsd, which likely
 * swamps the latency difference this benchmark is trying to expose. Choosing
 * a divisor much closer to 1.0 would keep x normal for the whole run.
 */
int main() {
    double const y = 1.21462343468798723984729;
    double const z = 1 / y; /* reciprocal, so both loops compute the same series */

    /* ---- division loop ---- */
    double op = 0.0;
    double x = 1.0;
    clock_t div_start = clock(); /* clock() returns clock_t, not time_t */
    for (size_t i = 0; i < TRIALS; i++) {
        x /= y;
        op += x;
        x /= y;
        op += x;
        x /= y;
        op += x;
        x /= y;
        op += x;
        x /= y;
        op += x;
    }
    clock_t div_end = clock();
    printf("%.16f %.16f\n", x, op);
    clock_t div_ticks = div_end - div_start;

    /* ---- multiplication loop ---- */
    double op2 = 0.0;
    x = 1.0;
    clock_t mul_start = clock();
    for (size_t i = 0; i < TRIALS; i++) {
        x *= z;
        op2 += x;
        x *= z;
        op2 += x;
        x *= z;
        op2 += x;
        x *= z;
        op2 += x;
        x *= z;
        op2 += x;
    }
    clock_t mul_end = clock();
    clock_t mul_ticks = mul_end - mul_start;
    printf("%.16f %.16f\n", x, op2);

    /* Print elapsed CPU time in clock() ticks (not seconds). clock_t is an
     * implementation-defined arithmetic type, so cast to match "%ld". */
    printf("div: %ld\nmul: %ld\n", (long)div_ticks, (long)mul_ticks);
    printf("div / mul: %f\n", (double)div_ticks / (double)mul_ticks);
    printf("%f\n", op - op2);
    return 0;
}
The assembly with O3:
# GCC-generated x86-64 assembly (GAS, AT&T syntax: "op src, dst").
# This part holds the printf format strings referenced by main.
.file "main.c"
.text
.section .rodata.str1.1,"aMS",@progbits,1
# Format used for both "x, running sum" lines (two double arguments).
.LC3:
.string "%.16f %.16f\n"
# Format for the two elapsed-time values (two integer arguments).
.LC5:
.string "div: %ld\nmul: %ld\n"
# Format for the elapsed-time ratio (one double argument).
.LC6:
.string "div / mul: %f\n"
# Format for the final difference of the two running sums (one double argument).
.LC7:
.string "%f\n"
# ---------------------------------------------------------------------------
# int main(void) -- compiler output for the benchmark, System V AMD64 ABI.
#
# Register roles in this listing:
#   %xmm0  x            (value being repeatedly divided / multiplied)
#   %xmm1  divisor y in the first loop; running sum op2 in the second loop
#   %xmm2  running sum op in the first loop
#   %xmm3  multiplier z in the second loop
#   %rax   loop counter inside the loops, otherwise call results
#   %rbp   start tick of the division loop, later the mul elapsed ticks
#   %rbx   elapsed ticks of the division loop
#   %r12   start tick of the multiplication loop
#   %r13   address of the .LC3 format string (used by two printf calls)
# Stack slots: 8(%rsp) = op, later op2; 16(%rsp) = x; 24(%rsp) = op.
# ---------------------------------------------------------------------------
.section .text.startup,"ax",@progbits
.p2align 4
.globl main
.type main, @function
main:
.LFB11:
.cfi_startproc
# Prologue: save the callee-saved registers used below. Four pushes (32 bytes)
# plus the 40-byte frame plus the return address make 80 bytes, so %rsp is
# 16-byte aligned at every call.
pushq %r13
.cfi_def_cfa_offset 16
.cfi_offset 13, -16
pushq %r12
.cfi_def_cfa_offset 24
.cfi_offset 12, -24
pushq %rbp
.cfi_def_cfa_offset 32
.cfi_offset 6, -32
pushq %rbx
.cfi_def_cfa_offset 40
.cfi_offset 3, -40
subq $40, %rsp
.cfi_def_cfa_offset 80
# --- division benchmark setup ---
call clock@PLT
# Load the bit pattern at .LC0 (the initial x) through an integer register.
movq .LC0(%rip), %rcx
# op = 0.0
pxor %xmm2, %xmm2
# %xmm1 = divisor from .LC2
movsd .LC2(%rip), %xmm1
# %rbp = start tick returned by clock()
movq %rax, %rbp
# Iteration count; the 32-bit write zero-extends into %rax.
movl $10000000, %eax
# x = value from .LC0
movq %rcx, %xmm0
.p2align 4,,10
.p2align 3
# Division loop: five (x /= y; op += x) pairs per iteration. Every divsd
# consumes the previous divsd result, and every addsd consumes the previous
# addsd result, so both form serial dependency chains.
.L2:
divsd %xmm1, %xmm0
addsd %xmm0, %xmm2
divsd %xmm1, %xmm0
addsd %xmm0, %xmm2
divsd %xmm1, %xmm0
addsd %xmm0, %xmm2
divsd %xmm1, %xmm0
addsd %xmm0, %xmm2
divsd %xmm1, %xmm0
addsd %xmm0, %xmm2
subq $1, %rax
jne .L2
# Spill op and x: all xmm registers are caller-saved across the clock() call.
movsd %xmm2, 8(%rsp)
leaq .LC3(%rip), %r13
movsd %xmm0, 16(%rsp)
call clock@PLT
# Reload op and x and set up printf(.LC3, x, op).
movsd 8(%rsp), %xmm2
movsd 16(%rsp), %xmm0
movq %r13, %rdi
movq %rax, %rbx
# %al = number of vector registers used by the variadic call (two doubles).
movl $2, %eax
movapd %xmm2, %xmm1
# %rbx = end tick - start tick for the division loop
subq %rbp, %rbx
call printf@PLT
# --- multiplication benchmark setup ---
call clock@PLT
movq .LC0(%rip), %rdx
# Reload op; it is stored again at 24(%rsp) after the loop for the final
# subtraction.
movsd 8(%rsp), %xmm2
# op2 = 0.0
pxor %xmm1, %xmm1
# %xmm3 = multiplier from .LC4
movsd .LC4(%rip), %xmm3
# %r12 = start tick returned by clock()
movq %rax, %r12
movl $10000000, %eax
# x = value from .LC0
movq %rdx, %xmm0
.p2align 4,,10
.p2align 3
# Multiplication loop: same shape as .L2 with mulsd in place of divsd.
.L3:
mulsd %xmm3, %xmm0
addsd %xmm0, %xmm1
mulsd %xmm3, %xmm0
addsd %xmm0, %xmm1
mulsd %xmm3, %xmm0
addsd %xmm0, %xmm1
mulsd %xmm3, %xmm0
addsd %xmm0, %xmm1
mulsd %xmm3, %xmm0
addsd %xmm0, %xmm1
subq $1, %rax
jne .L3
# Spill op, op2 and x before the clock() call.
movsd %xmm2, 24(%rsp)
movsd %xmm1, 8(%rsp)
movsd %xmm0, 16(%rsp)
call clock@PLT
# Set up printf(.LC3, x, op2).
movsd 8(%rsp), %xmm1
movsd 16(%rsp), %xmm0
movq %r13, %rdi
# %rax = end tick - start tick for the multiplication loop; kept in %rbp
subq %r12, %rax
movq %rax, %rbp
movl $2, %eax
call printf@PLT
# printf(.LC5, div elapsed, mul elapsed); %al = 0 because no vector args.
movq %rbp, %rdx
movq %rbx, %rsi
xorl %eax, %eax
leaq .LC5(%rip), %rdi
call printf@PLT
# Zero the destinations first: cvtsi2sd only writes the low 64 bits, so this
# avoids a false dependency on the old register contents.
pxor %xmm0, %xmm0
pxor %xmm3, %xmm3
leaq .LC6(%rip), %rdi
cvtsi2sdq %rbp, %xmm3
movl $1, %eax
cvtsi2sdq %rbx, %xmm0
# %xmm0 = (double)div elapsed / (double)mul elapsed
divsd %xmm3, %xmm0
call printf@PLT
# printf(.LC7, op - op2)
movsd 24(%rsp), %xmm2
movsd 8(%rsp), %xmm1
leaq .LC7(%rip), %rdi
movl $1, %eax
subsd %xmm1, %xmm2
movapd %xmm2, %xmm0
call printf@PLT
# Epilogue: release the frame, return 0, restore callee-saved registers.
addq $40, %rsp
.cfi_def_cfa_offset 40
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 32
popq %rbp
.cfi_def_cfa_offset 24
popq %r12
.cfi_def_cfa_offset 16
popq %r13
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE11:
.size main, .-main
# 8-byte floating-point constants, each emitted as two 32-bit words
# (low word first, then the high word holding sign, exponent and top mantissa bits).
.section .rodata.cst8,"aM",@progbits,8
.align 8
# .LC0: the double 1.0 (high word 1072693248 = 0x3FF00000); initial value of x
# for both loops.
.LC0:
.long 0
.long 1072693248
.align 8
# .LC2: divisor loaded into %xmm1 for the divsd loop; approximately 1.2146.
.LC2:
.long -74511709
.long 1072918296
.align 8
# .LC4: multiplier loaded into %xmm3 for the mulsd loop; approximately 0.8233,
# i.e. the reciprocal of the .LC2 value, computed at compile time.
.LC4:
.long 644960408
.long 1072322682
.ident "GCC: (GNU) 12.2.0"
.section .note.GNU-stack,"",@progbits
It's clear that divsd is not being optimized out, so what am I doing wrong?
c
performance
assembly
floating-point
x86-64
0 Answers
Your Answer