#include "ibp.hpp"
constexpr uint64_t Count = 1024 * 1024 * 1024; // 1 GiO
extern "C" void FctAdd(uint64_t Count);
extern "C" void FctMovAdd(uint64_t Count);
int main() {
ibp::comparisonTester({
{"FctAdd", [&]() { FctAdd(Count); }},
{"FctMovAdd", [&]() { FctMovAdd(Count); }}
}, Count, Count, 1);
}
FctAdd:
align 64
.loop:
add rcx, 1
add rcx, 1
dec rdi
jnz .loop
ret
FctMovAdd:
align 64
.loop:
mov rcx, rax
add rcx, 1
mov rcx, rax
add rcx, 1
dec rdi
jnz .loop
ret
Quelle fonction est la plus rapide?
FctAdd
est plus rapideFctMovAdd
est plus rapide Label AvgCycles GB/s Cycles/item Speedup
FctAdd 1679822459 2.62 1.56 1.00x
FctMovAdd 840759587 5.23 0.78 2.00x
Qu’est-ce qui explique la différence de performance?
RCX: $P_0$
Instruction | Dest | Src |
---|---|---|
add rcx, 1 | ||
add rcx, 1 | ||
add rcx, 1 | ||
add rcx, 1 | ||
… |
RCX: $\cancel{P_0},~P_1$
Instruction | Dest | Src |
---|---|---|
add rcx, 1 | $P_1$ | $P_0$ |
add rcx, 1 | ||
add rcx, 1 | ||
add rcx, 1 | ||
… |
RCX: $\cancel{P_0},~\cancel{P_1},~P_2$
Instruction | Dest | Src |
---|---|---|
add rcx, 1 | $P_1$ | $P_0$ |
add rcx, 1 | $P_2$ | $P_1$ |
add rcx, 1 | ||
add rcx, 1 | ||
… |
RCX: $\cancel{P_0},~\cancel{P_1},$ $\cancel{P_2},~P_3$
Instruction | Dest | Src |
---|---|---|
add rcx, 1 | $P_1$ | $P_0$ |
add rcx, 1 | $P_2$ | $P_1$ |
add rcx, 1 | $P_3$ | $P_2$ |
add rcx, 1 | ||
… |
RCX: $\cancel{P_0}$, $\cancel{P_1}$, $\cancel{P_2}$, $\cancel{P_3},~P_4$
Instruction | Dest | Src |
---|---|---|
add rcx, 1 | $P_1$ | $P_0$ |
add rcx, 1 | $P_2$ | $P_1$ |
add rcx, 1 | $P_3$ | $P_2$ |
add rcx, 1 | $P_4$ | $P_3$ |
… | … | … |
RAX: $P_0$ RCX: $\ast$
Instruction | Dest | Src |
---|---|---|
mov rcx, rax | ||
add rcx, 1 | ||
mov rcx, rax | ||
add rcx, 1 | ||
… |
RAX: $P_0$ RCX: $\cancel{\ast},~P_1$
Instruction | Dest | Src |
---|---|---|
mov rcx, rax | $P_1$ | $P_0$ |
add rcx, 1 | ||
mov rcx, rax | ||
add rcx, 1 | ||
… |
RAX: $P_0$ RCX: $\cancel{\ast},~\cancel{P_1},~P_2$
Instruction | Dest | Src |
---|---|---|
mov rcx, rax | $P_1$ | $P_0$ |
add rcx, 1 | $P_2$ | $P_1$ |
mov rcx, rax | ||
add rcx, 1 | ||
… |
RAX: $P_0$ RCX: $\cancel{\ast},~\cancel{P_1}$, $\cancel{P_2},~P_3$
Instruction | Dest | Src |
---|---|---|
mov rcx, rax | $P_1$ | $P_0$ |
add rcx, 1 | $P_2$ | $P_1$ |
mov rcx, rax | $P_3$ | $P_0$ |
add rcx, 1 | ||
… |
RAX: $P_0$ RCX: $\cancel{\ast},~\cancel{P_1}$, $\cancel{P_2}$, $\cancel{P_3}$, $P_4$
Instruction | Dest | Src |
---|---|---|
mov rcx, rax | $P_1$ | $P_0$ |
add rcx, 1 | $P_2$ | $P_1$ |
mov rcx, rax | $P_3$ | $P_0$ |
add rcx, 1 | $P_4$ | $P_3$ |
… |
MOV
Instruction | Dest | Src |
---|---|---|
mov rcx, rax | $P_0$ | $P_0$ |
add rcx, 1 | $P_1$ | $P_0$ |
mov rcx, rax | $P_0$ | $P_0$ |
add rcx, 1 | $P_2$ | $P_0$ |
… |
R1 ← mem[1]
R1 ← R1 + 1
mem[1] ← R1
R1 ← mem[10]
R1 ← R1 + 4
mem[11] ← R1
R1a ← mem[1]
R1b ← R1a + 1
mem[1] ← R1b
R1c ← mem[10]
R1d ← R1c + 4
mem[11] ← R1d
R1a ← mem[1] & R1c ← mem[10]
R1b ← R1a + 1 & R1d ← R1c + 4
mem[1] ← R1b & mem[11] ← R1d