Assembly Code to measure latency and throughput
This code measures the latency and throughput of CPU instructions (draft).
Using EuroAssembler
EUROASM AutoSegment=Yes, CPU=X64, SIMD=AVX2, PRIV=ENABLED
latency PROGRAM Format=PE, Width=64, Model=Flat, IconFile=, Entry=main:
INCLUDE winscon.htm, winabi.htm, cpuext64.htm
LINK cvirt.lib
; ---- Console text and format strings ------------------------------------
; NOTE(review): "%%" is presumably the assembler's escape for a single "%",
; so the formatter sees "%s", "%d[b0]", "%f[p2]" etc. - confirm in the manual.
Msg         D "TSC Ticks per 1 s = ",0
Buffer      DB 80 * B                       ; 80-byte scratch buffer
MsgNL       D "---------------------------------------------------------------------------------",10,13,0
; Column legend printed above the two summary rows.
MsgLG       D "Test | VAL | IPC | Instr. | Cycles | TSCnt. | ExecT. | Boost | Base | Frequency",0
; Summary-row formats (throughput / latency); each consumes nine floating-point values.
FmtT        D "%%s< THR %%f[p2] %%f[p2] %%f[p2] G %%f[p2] G %%f[p2] G %%f[p2] s x%%f[p2] %%f[p2] %%f[p2] GHz",10,13,0
FmtL        D "%%s< LAT %%f[p2] %%f[p2] %%f[p2] G %%f[p2] G %%f[p2] G %%f[p2] s x%%f[p2] %%f[p2] %%f[p2] GHz",10,13,0
; Section banners. The throughput banner used to name "IMUL r64,r64" although the
; benchmark macros time a different instruction; it is now as generic as MsgLA so
; it cannot go stale when the instruction under test is swapped.
MsgTP       D "Throughput Test",0
MsgLA       D "Latency Test",0
; Per-counter lines printed by EndBench (one integer argument each).
Msg0        D "Instructions Retired = %%d[b0]",13,10,0
Msg1        D "Actual Cycles = %%d[b0]",13,10,0
Msg2        D "Reference Cycles = %%d[b0]",13,10,0
MsgT        D "TSC Ticks = %%d[b0]",13,10,0
; Single-value formats (one floating-point argument each).
ExecTimeFmt D "Execution Time = %%f s",13,10,0
RatioFmt    D "CPU Freq Boost Ratio = %%f",13,10,0
LatencyFmt  D " Latency = %%f",13,10,0
ThroughFmt  D " Throughput = %%f",13,10,0
CPUFreqFmt  D " CPU Base Frequency = %%f GHz",13,10,0

; ---- Calibration values --------------------------------------------------
Freq        DB Q 2_700_000_000              ; placeholder; main stores the measured TSC ticks per second here
QPFreq      DB Q 10_000_000                 ; QueryPerformanceCounter ticks that main waits for while calibrating
Gig         DB Q 1_000_000_000              ; divisor used to scale raw counts to "G" / GHz
Ticks       DB Q                            ; output slot for QueryPerformanceCounter
BaseFreq    DB Q                            ; main stores Freq / Gig here as a double

; ---- Smallest reading seen so far for each counter -------------------------
; main seeds all eight slots with a large sentinel; EndBench lowers them.
TInstr      DB Q                            ; throughput test: instructions retired
TCycl       DB Q                            ; throughput test: actual cycles
TRef        DB Q                            ; throughput test: reference cycles
TTSC        DB Q                            ; throughput test: TSC ticks
LInstr      DB Q                            ; latency test: instructions retired
LCycl       DB Q                            ; latency test: actual cycles
LRef        DB Q                            ; latency test: reference cycles
LTSC        DB Q                            ; latency test: TSC ticks
; StartBench Type, Counter, Destination
; Takes the opening counter sample of one measurement.
;   Type        - instruction that delivers the counter in EDX:EAX
;                 (this file passes RDPMC or RDTSC)
;   Counter     - value placed in ECX before %Type executes
;   Destination - accepted for call-site compatibility; the body never reads it
; Result: the combined 64-bit sample is left in R8 (a copy stays in RAX).
; Also modifies RCX, RDX and the flags.
StartBench %MACRO Type, Counter, Destination
        mov     ecx, %Counter           ; counter selector for %Type
        %Type                           ; EDX:EAX = current counter value
        shl     rdx, 32                 ; move the high half into bits 63..32
        or      rax, rdx                ; RAX = full 64-bit sample
        mov     r8, rax                 ; R8 = start sample, consumed by EndBench
%ENDMACRO
; EndBench Type, Counter, Message, Min
; Takes the closing counter sample, turns it into an elapsed count relative to
; the start sample held in R8, folds it into a running minimum and prints that
; minimum.
;   Type    - instruction that delivers the counter in EDX:EAX
;             (this file passes RDPMC or RDTSCP)
;   Counter - value placed in ECX before %Type executes
;   Message - format string passed to FmtOut together with the minimum
;   Min     - qword variable holding the smallest elapsed count seen so far
; Expects R8 = start sample. Modifies RAX, RBX, RCX, RDX and the flags, plus
; whatever the WinABI FmtOut call changes.
EndBench %MACRO Type, Counter, Message, Min
        mov     ecx, %Counter           ; counter selector for %Type
        %Type                           ; EDX:EAX = current counter value
        shl     rdx, 32
        or      rax, rdx                ; RAX = full 64-bit sample
        sub     rax, r8                 ; RAX = elapsed = end - start
        mov     rbx, [%Min]             ; RBX = best (smallest) value so far
        ; Branchless minimum. Counter deltas are unsigned quantities, so the
        ; comparison must be unsigned: with a signed compare a delta that
        ; wrapped to a "negative" 64-bit value would be recorded as the
        ; minimum and could never be displaced again.
        cmp     rax, rbx
        cmova   rax, rbx                ; if elapsed > best (unsigned) keep best
        mov     [%Min], rax
        WinABI FmtOut, %Message, rax    ; report the running minimum
%ENDMACRO
; ThroughputBench L
; Emits the throughput workload: an outer loop of 250_000 passes around an
; unrolled run of four VMPSADBW instructions that write different destination
; registers and only read registers none of them writes, so no instruction
; depends on another.
;   L - name of the loop label to generate; must differ for every expansion
; Modifies R9, YMM0, YMM3, YMM6, YMM9 and the flags.
; NOTE(review): the %FOR range 0..1000 yields 1001 copies of the group if the
; range is inclusive - confirm whether 1000 was intended.
ThroughputBench %MACRO L
        mov     r9, 250_000             ; outer pass counter
%L:
i %FOR 0..1000
        vmpsadbw ymm0, ymm1, ymm2, 0
        vmpsadbw ymm3, ymm4, ymm5, 0
        vmpsadbw ymm6, ymm7, ymm8, 0
        vmpsadbw ymm9, ymm10, ymm11, 0
        ; Scalar alternative kept for reference:
        ;   imul r10, r11 / imul r11, r12 / imul r12, r13 / imul r14, r14
%ENDFOR i
        dec     r9
        jnz     %L                      ; next pass until R9 reaches zero
%ENDMACRO
; LatencyBench L
; Emits the latency workload: an outer loop of 250_000 passes around an
; unrolled run of VMPSADBW instructions that all read and write YMM0, so each
; one has to wait for the result of the previous one.
;   L - name of the loop label to generate; must differ for every expansion
; Modifies R9, YMM0 and the flags.
; NOTE(review): the %FOR range 0..4000 yields 4001 copies if the range is
; inclusive - confirm whether 4000 was intended.
LatencyBench %MACRO L
        mov     r9, 250_000             ; outer pass counter
%L:
i %FOR 0..4000
        vmpsadbw ymm0, ymm0, ymm0, 0    ; dependent on the previous copy
        ; Scalar alternative kept for reference: imul r10, r10
%ENDFOR i
        dec     r9
        jnz     %L                      ; next pass until R9 reaches zero
%ENDMACRO
; PrintBenchRow RowFmt, Instr, Cycl, Tsc
; Private helper of main: prints one summary row.
;   RowFmt - format string taking nine floating-point values (FmtT or FmtL)
;   Instr  - qword variable: instructions retired
;   Cycl   - qword variable: actual core cycles
;   Tsc    - qword variable: TSC ticks
; Also reads [Freq] (TSC ticks per second) and [Gig].
; The integer counts are converted with CVTSI2SD. Loading their raw bit
; patterns into an XMM register and dividing them as doubles only works through
; denormal arithmetic and breaks when denormals are treated as zero.
; Values passed, in order: cycles/instr, instr/cycle, instr/1e9, cycles/1e9,
; tsc/1e9, tsc/Freq (seconds), cycles/tsc (boost), Freq/1e9 (base GHz),
; base*boost (effective GHz).
; Modifies RAX and XMM0-XMM9, plus whatever the WinABI FmtOut call changes.
PrintBenchRow %MACRO RowFmt, Instr, Cycl, Tsc
        mov      rax, [Gig]
        cvtsi2sd xmm4, rax              ; xmm4 = 1e9
        mov      rax, [%Instr]
        cvtsi2sd xmm2, rax              ; xmm2 = instructions
        mov      rax, [%Cycl]
        cvtsi2sd xmm3, rax              ; xmm3 = core cycles
        mov      rax, [%Tsc]
        cvtsi2sd xmm5, rax              ; xmm5 = TSC ticks
        mov      rax, [Freq]
        cvtsi2sd xmm9, rax              ; xmm9 = TSC ticks per second
        movaps   xmm0, xmm3
        divsd    xmm0, xmm2             ; VAL    = cycles / instructions
        movaps   xmm1, xmm2
        divsd    xmm1, xmm3             ; IPC    = instructions / cycles
        movaps   xmm6, xmm5
        divsd    xmm6, xmm9             ; ExecT  = TSC ticks / ticks per second
        movaps   xmm7, xmm3
        divsd    xmm7, xmm5             ; Boost  = cycles / TSC ticks
        divsd    xmm2, xmm4             ; Instr. in units of 1e9
        divsd    xmm3, xmm4             ; Cycles in units of 1e9
        divsd    xmm5, xmm4             ; TSCnt. in units of 1e9
        divsd    xmm9, xmm4             ; Base   = TSC frequency in GHz
        movaps   xmm8, xmm9
        mulsd    xmm8, xmm7             ; Frequency = Base * Boost
        WinABI FmtOut, %RowFmt, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7, xmm9, xmm8
%ENDMACRO

; Program entry point.
;  1. Seeds the eight "smallest reading" slots with a large sentinel.
;  2. Pins the thread to the first logical processor and raises its priority.
;  3. Calibrates: counts TSC ticks during one second of QueryPerformanceCounter
;     time and stores the result in Freq (and Freq/1e9 in BaseFreq).
;  4. Repeats forever: runs the throughput and latency workloads once per
;     counter, prints the running minima and the two summary rows.
; Registers live across calls: R10 = QPC start, R12 = thread handle,
; R13 = TSC start, R14 = TSC delta, R15 = QPC ticks per second.
main: nop
        mov     rax, 100_000_000_000    ; sentinel above any expected reading
        mov     [TInstr], rax
        mov     [TCycl], rax
        mov     [TRef], rax
        mov     [TTSC], rax
        mov     [LInstr], rax
        mov     [LCycl], rax
        mov     [LRef], rax
        mov     [LTSC], rax
        WinABI SetStdioWindowOptions,0,0
        WinABI GetCurrentThread         ; RAX = handle of this thread
        ; SetThreadAffinityMask returns the previous mask in RAX, so the handle
        ; must be kept elsewhere to remain valid for SetThreadPriority.
        mov     r12, rax
        WinABI SetThreadAffinityMask, r12, 1    ; mask 1 = first logical processor
        WinABI SetThreadPriority, r12, 15       ; 15 = THREAD_PRIORITY_TIME_CRITICAL
        ; Store the real counter frequency in the variable the loop reads,
        ; instead of relying on the built-in default in QPFreq.
        WinABI QueryPerformanceFrequency, QPFreq
        mov     r15, [QPFreq]           ; R15 = QPC ticks in one second
        WinABI QueryPerformanceCounter, Ticks
        mov     r10, [Ticks]            ; R10 = QPC reading at start
        cpuid                           ; serialize before the first TSC read
        rdtsc
        shl     rdx, 32
        or      rax, rdx
        mov     r13, rax                ; R13 = TSC reading at start
LC:
        rdtscp
        shl     rdx, 32
        or      rax, rdx
        sub     rax, r13
        mov     r14, rax                ; R14 = TSC ticks elapsed so far
        WinABI QueryPerformanceCounter, Ticks
        mov     r11, [Ticks]
        sub     r11, r10                ; R11 = QPC ticks elapsed so far
        cmp     r11, r15
        jbe     LC                      ; keep sampling until one second has passed
        mov     [Freq], r14             ; Freq = TSC ticks per second
        cvtsi2sd xmm0, r14
        mov     rax, [Gig]
        cvtsi2sd xmm1, rax
        divsd   xmm0, xmm1              ; xmm0 = TSC frequency in GHz
        movq    [BaseFreq], xmm0
        WinABI FmtOut, CPUFreqFmt, xmm0
Loop:
        ; Throughput workload, once per counter. Selectors 0x40000000..2 are
        ; the counters reported by Msg0/Msg1/Msg2; the last run uses the TSC.
        StdOutput MsgTP, Eol=Yes, Console=Yes
        StartBench RDPMC, 0x40000000
        ThroughputBench L1
        EndBench RDPMC, 0x40000000, Msg0, TInstr
        StartBench RDPMC, 0x40000001
        ThroughputBench L2
        EndBench RDPMC, 0x40000001, Msg1, TCycl
        StartBench RDPMC, 0x40000002
        ThroughputBench L3
        EndBench RDPMC, 0x40000002, Msg2, TRef
        StartBench RDTSC, 0x0
        ThroughputBench L4
        EndBench RDTSCP, 0x0, MsgT, TTSC
        ; Latency workload, same sequence of counters.
        StdOutput MsgLA, Eol=Yes, Console=Yes
        StartBench RDPMC, 0x40000000
        LatencyBench L5
        EndBench RDPMC, 0x40000000, Msg0, LInstr
        StartBench RDPMC, 0x40000001
        LatencyBench L6
        EndBench RDPMC, 0x40000001, Msg1, LCycl
        StartBench RDPMC, 0x40000002
        LatencyBench L7
        EndBench RDPMC, 0x40000002, Msg2, LRef
        StartBench RDTSC, 0x0
        LatencyBench L8
        EndBench RDTSCP, 0x0, MsgT, LTSC
        ; Summary table. Reference cycles (TRef/LRef) are measured and printed
        ; above but have no column in the rows.
        StdOutput MsgLG, Eol=Yes, Console=Yes
        PrintBenchRow FmtT, TInstr, TCycl, TTSC
        PrintBenchRow FmtL, LInstr, LCycl, LTSC
        StdOutput MsgNL, Eol=Yes, Console=Yes
        jmp     Loop                    ; measure again; the minima keep improving
        TerminateProgram                ; not reached while the loop above is unconditional
ENDPROGRAM
Reference values for VMPSADBW ymm, ymm, ymm, imm8 on Intel Haswell: latency 6 cycles, throughput 2 cycles per instruction.
Result:
Test | VAL | IPC | Instr. | Cycles | TSCnt. | ExecT. | Boost | Base | Frequency
THR 2.10 0.48 1.01 G 2.13 G 2.03 G 0.62 s x1.05 3.29 3.46 GHz
LAT 6.27 0.16 1.18 G 7.38 G 6.86 G 2.08 s x1.08 3.29 3.54 GHz
---------------------------------------------------------------------------------