1 year ago

#373465

test-img

mas

Why is divsd not much slower than mulsd in this benchmark?

Someone told me that division is about 40x slower than multiplication. Curious, I wrote a benchmark. It took some help from greybeards who know more about what the compiler might optimize out, but after several revisions we still cannot find a meaningful difference between divsd and mulsd.

Here are the results:

➜  linked gcc main.c && ./a.out
0.0000000000000000 4.6593234399298842
0.0000000000000000 4.6593234399298842
div: 2080434
mul: 1925889
div / mul: 1.080246
0.000000

And with O3:

➜  linked gcc main.c -O3 && ./a.out
0.0000000000000000 4.6593234399298842
0.0000000000000000 4.6593234399298842
div: 1948388
mul: 1804587
div / mul: 1.079686
0.000000

The code:

#include <time.h>
#include <stdio.h>

#define TRIALS 10000000

/*
 * Micro-benchmark: times a chain of dependent double divisions (x /= y)
 * against the equivalent chain of multiplications by the reciprocal (x *= z).
 *
 * Each loop runs TRIALS iterations with five operations per iteration, and
 * every intermediate x is added to an accumulator that is printed afterwards,
 * so the arithmetic cannot be discarded as dead code.
 *
 * Timing uses clock(), whose result type is clock_t and whose unit is clock
 * ticks of processor time (divide by CLOCKS_PER_SEC to get seconds). The
 * printed "div:" / "mul:" values are therefore ticks, not seconds.
 *
 * NOTE(review): x is scaled down by a factor of about 1.21 on every step, so
 * it drops below the smallest normal double (about 2.2e-308) after a few
 * thousand of the 5 * TRIALS steps. Nearly all timed operations then work on
 * subnormal values, which may dominate the measurement for both loops --
 * verify on the target CPU before drawing conclusions about div vs. mul.
 *
 * Returns 0.
 */
int main(void) {
    double const y = 1.21462343468798723984729;
    double const z = 1 / y;            /* reciprocal, so x *= z mirrors x /= y */

    /* ---- division chain ---- */
    double op = 0.0;                   /* running sum of every intermediate x */
    double x = 1.0;
    clock_t div_start = clock();
    for (size_t i = 0; i < TRIALS; i++) {
        /* Manually unrolled five times; each step depends on the previous x. */
        x /= y;
        op += x;
        x /= y;
        op += x;
        x /= y;
        op += x;
        x /= y;
        op += x;
        x /= y;
        op += x;
    }
    clock_t div_end = clock();
    printf("%.16f %.16f\n", x, op);
    clock_t div_ticks = div_end - div_start;

    /* ---- multiplication chain ---- */
    double op2 = 0.0;
    x = 1.0;
    clock_t mul_start = clock();
    for (size_t i = 0; i < TRIALS; i++) {
        x *= z;
        op2 += x;
        x *= z;
        op2 += x;
        x *= z;
        op2 += x;
        x *= z;
        op2 += x;
        x *= z;
        op2 += x;
    }
    clock_t mul_end = clock();
    clock_t mul_ticks = mul_end - mul_start;
    printf("%.16f %.16f\n", x, op2);

    /*
     * Report elapsed processor time in clock ticks. clock_t is only
     * guaranteed to be an arithmetic type, so cast explicitly to match %ld.
     */
    printf("div: %ld\nmul: %ld\n", (long)div_ticks, (long)mul_ticks);
    printf("div / mul: %f\n", (double)div_ticks / (double)mul_ticks);
    /* Difference between the two accumulators, as a sanity check. */
    printf("%f\n", op - op2);

    return 0;
}

The assembly with O3:

    # GCC output for main.c (AT&T / GAS syntax, x86-64).
    # This part holds the file marker and the printf format strings that
    # main references through RIP-relative leaq instructions.
    .file   "main.c"
    .text
    # String-literal section for the format strings below.
    .section    .rodata.str1.1,"aMS",@progbits,1
    # Format used for both "x, accumulator" printouts.
.LC3:
    .string "%.16f %.16f\n"
    # Format for the two elapsed clock() differences.
.LC5:
    .string "div: %ld\nmul: %ld\n"
    # Format for the ratio of the two elapsed values.
.LC6:
    .string "div / mul: %f\n"
    # Format for the final accumulator difference.
.LC7:
    .string "%f\n"
    # -----------------------------------------------------------------------
    # int main()  -- compiler-generated body of the div-vs-mul benchmark.
    #
    # Register roles as used below:
    #   %rbp  first clock() result, later the elapsed value of the mul loop
    #   %rbx  elapsed value of the div loop (second clock() minus first)
    #   %r12  clock() result taken just before the mul loop
    #   %r13  address of .LC3, reused for both "%.16f %.16f\n" printf calls
    #   %rax  loop counter inside .L2 / .L3 (counts down from 10000000)
    #   %xmm0 x;  %xmm1 divisor in .L2, accumulator in .L3
    #   %xmm2 accumulator in .L2;  %xmm3 multiplier in .L3
    # Stack slots (relative to %rsp after the subq):
    #   8(%rsp)  accumulator saved across calls
    #   16(%rsp) x saved across calls
    #   24(%rsp) first-loop accumulator kept for the final subtraction
    # -----------------------------------------------------------------------
    .section    .text.startup,"ax",@progbits
    .p2align 4
    .globl  main
    .type   main, @function
main:
.LFB11:
    .cfi_startproc
    # Save the four callee-saved registers this function uses.
    pushq   %r13
    .cfi_def_cfa_offset 16
    .cfi_offset 13, -16
    pushq   %r12
    .cfi_def_cfa_offset 24
    .cfi_offset 12, -24
    pushq   %rbp
    .cfi_def_cfa_offset 32
    .cfi_offset 6, -32
    pushq   %rbx
    .cfi_def_cfa_offset 40
    .cfi_offset 3, -40
    # Reserve 40 bytes for the spill slots listed in the header.
    subq    $40, %rsp
    .cfi_def_cfa_offset 80
    # Start of the div measurement: %rbp = clock().
    call    clock@PLT
    # Set up the div loop: x = .LC0, accumulator = 0, divisor = .LC2,
    # counter = 10000000.
    movq    .LC0(%rip), %rcx
    pxor    %xmm2, %xmm2
    movsd   .LC2(%rip), %xmm1
    movq    %rax, %rbp
    movl    $10000000, %eax
    movq    %rcx, %xmm0
    .p2align 4,,10
    .p2align 3
.L2:
    # Five dependent steps per iteration: x /= divisor; accumulator += x.
    # All five divsd instructions are present; none were folded away.
    divsd   %xmm1, %xmm0
    addsd   %xmm0, %xmm2
    divsd   %xmm1, %xmm0
    addsd   %xmm0, %xmm2
    divsd   %xmm1, %xmm0
    addsd   %xmm0, %xmm2
    divsd   %xmm1, %xmm0
    addsd   %xmm0, %xmm2
    divsd   %xmm1, %xmm0
    addsd   %xmm0, %xmm2
    subq    $1, %rax
    jne .L2
    # Spill accumulator and x around the clock() call, which ends the div
    # measurement, then reload them.
    movsd   %xmm2, 8(%rsp)
    leaq    .LC3(%rip), %r13
    movsd   %xmm0, 16(%rsp)
    call    clock@PLT
    movsd   8(%rsp), %xmm2
    movsd   16(%rsp), %xmm0
    # printf(.LC3, x, accumulator); %eax = 2 is the count of vector-register
    # arguments for the variadic call. %rbx = end - start for the div loop.
    movq    %r13, %rdi
    movq    %rax, %rbx
    movl    $2, %eax
    movapd  %xmm2, %xmm1
    subq    %rbp, %rbx
    call    printf@PLT
    # Start of the mul measurement: %r12 = clock().
    call    clock@PLT
    # Set up the mul loop: x = .LC0, new accumulator = 0 in %xmm1,
    # multiplier = .LC4, counter = 10000000. The first accumulator is
    # reloaded into %xmm2 so it can be saved again after the loop.
    movq    .LC0(%rip), %rdx
    movsd   8(%rsp), %xmm2
    pxor    %xmm1, %xmm1
    movsd   .LC4(%rip), %xmm3
    movq    %rax, %r12
    movl    $10000000, %eax
    movq    %rdx, %xmm0
    .p2align 4,,10
    .p2align 3
.L3:
    # Same shape as .L2 with mulsd in place of divsd:
    # x *= multiplier; accumulator += x, five times per iteration.
    mulsd   %xmm3, %xmm0
    addsd   %xmm0, %xmm1
    mulsd   %xmm3, %xmm0
    addsd   %xmm0, %xmm1
    mulsd   %xmm3, %xmm0
    addsd   %xmm0, %xmm1
    mulsd   %xmm3, %xmm0
    addsd   %xmm0, %xmm1
    mulsd   %xmm3, %xmm0
    addsd   %xmm0, %xmm1
    subq    $1, %rax
    jne .L3
    # Spill both accumulators and x, then end the mul measurement.
    movsd   %xmm2, 24(%rsp)
    movsd   %xmm1, 8(%rsp)
    movsd   %xmm0, 16(%rsp)
    call    clock@PLT
    movsd   8(%rsp), %xmm1
    movsd   16(%rsp), %xmm0
    # %rbp = end - start for the mul loop; printf(.LC3, x, accumulator).
    movq    %r13, %rdi
    subq    %r12, %rax
    movq    %rax, %rbp
    movl    $2, %eax
    call    printf@PLT
    # printf(.LC5, div elapsed, mul elapsed); no vector arguments, so %eax = 0.
    movq    %rbp, %rdx
    movq    %rbx, %rsi
    xorl    %eax, %eax
    leaq    .LC5(%rip), %rdi
    call    printf@PLT
    # Convert both elapsed values to double and print their ratio. The two
    # registers are zeroed with pxor before cvtsi2sdq writes into them.
    pxor    %xmm0, %xmm0
    pxor    %xmm3, %xmm3
    leaq    .LC6(%rip), %rdi
    cvtsi2sdq   %rbp, %xmm3
    movl    $1, %eax
    cvtsi2sdq   %rbx, %xmm0
    divsd   %xmm3, %xmm0
    call    printf@PLT
    # Print (first accumulator - second accumulator) with .LC7.
    movsd   24(%rsp), %xmm2
    movsd   8(%rsp), %xmm1
    leaq    .LC7(%rip), %rdi
    movl    $1, %eax
    subsd   %xmm1, %xmm2
    movapd  %xmm2, %xmm0
    call    printf@PLT
    # Epilogue: release the locals, return 0, restore saved registers in
    # reverse push order.
    addq    $40, %rsp
    .cfi_def_cfa_offset 40
    xorl    %eax, %eax
    popq    %rbx
    .cfi_def_cfa_offset 32
    popq    %rbp
    .cfi_def_cfa_offset 24
    popq    %r12
    .cfi_def_cfa_offset 16
    popq    %r13
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE11:
    .size   main, .-main
    # 8-byte floating-point constants loaded by main. Each is emitted as two
    # 32-bit words, low word first.
    .section    .rodata.cst8,"aM",@progbits,8
    .align 8
    # 1.0 (high word 1072693248 = 0x3FF00000): the initial value of x.
.LC0:
    .long   0
    .long   1072693248
    .align 8
    # Divisor used by the divsd loop; corresponds to y (about 1.2146) in the
    # C source.
.LC2:
    .long   -74511709
    .long   1072918296
    .align 8
    # Multiplier used by the mulsd loop; corresponds to z = 1 / y
    # (about 0.8233) in the C source, folded to a constant by the compiler.
.LC4:
    .long   644960408
    .long   1072322682
    # Compiler identification and the GNU-stack note section emitted by GCC.
    .ident  "GCC: (GNU) 12.2.0"
    .section    .note.GNU-stack,"",@progbits

The assembly makes it clear that divsd is not being optimized out. So what am I doing wrong?

c

performance

assembly

floating-point

x86-64

0 Answers

Your Answer

Accepted video resources