Assembly Code to measure latency and throughput

This code measures the latency and throughput of CPU instructions (draft).

Using EuroAssembler

EUROASM AutoSegment=Yes, CPU=X64, SIMD=AVX2,  PRIV=ENABLED
latency PROGRAM Format=PE, Width=64, Model=Flat, IconFile=, Entry=main:

INCLUDE winscon.htm, winabi.htm, cpuext64.htm

LINK cvirt.lib

; ---- Console text and FmtOut format strings --------------------------------
; Msg and Buffer are not referenced by the code visible in this file.
Msg D "TSC Ticks per 1 s = ",0
Buffer DB 80 * B
MsgNL D "---------------------------------------------------------------------------------",10,13,0
MsgLG D "Test | VAL  | IPC  | Instr. | Cycles | TSCnt. | ExecT. | Boost | Base | Frequency",0
; FmtT / FmtL: one summary-table row each (throughput / latency), nine numeric
; fields in the column order of MsgLG. "%%" presumably yields a literal "%"
; for FmtOut - confirm against the EuroAssembler documentation.
FmtT D  "%%s< THR   %%f[p2]   %%f[p2]   %%f[p2] G   %%f[p2] G   %%f[p2] G   %%f[p2] s   x%%f[p2]   %%f[p2]   %%f[p2] GHz",10,13,0
FmtL D  "%%s< LAT   %%f[p2]   %%f[p2]   %%f[p2] G   %%f[p2] G   %%f[p2] G   %%f[p2] s   x%%f[p2]   %%f[p2]   %%f[p2] GHz",10,13,0
; Layout sketch of one row:
;            THR      1.0    IPC  1.0    1.01 G   1.01 G   1.01 G   1.01 G   1.255   x1.25   3.61   3.10

; Section headers printed before each group of runs. The throughput header is
; kept generic: the kernel in ThroughputBench is VMPSADBW, not IMUL, so naming
; an instruction here would mislabel the output.
MsgTP D "Throughput Test",0
MsgLA D "Latency Test",0

; Per-counter lines printed by EndBench (one integer argument each).
Msg0 D "Instructions Retired = %%d[b0]",13,10,0
Msg1 D "Actual Cycles = %%d[b0]",13,10,0
Msg2 D "Reference Cycles = %%d[b0]",13,10,0
MsgT D "TSC Ticks = %%d[b0]",13,10,0
; Only CPUFreqFmt of the following formats is used by the visible code.
ExecTimeFmt D "Execution Time = %%f s",13,10,0
RatioFmt D "CPU Freq Boost Ratio = %%f",13,10,0
LatencyFmt D " Latency = %%f",13,10,0
ThroughFmt D " Throughput = %%f",13,10,0
CPUFreqFmt D " CPU Base Frequency = %%f GHz",13,10,0

; ---- Numeric state (64-bit integers unless noted) ---------------------------
Freq DB Q 2_700_000_000     ; seed only; main stores the measured TSC ticks per ~1 s here
QPFreq DB Q 10_000_000      ; QueryPerformanceCounter tick rate used to time the calibration window
Gig DB Q 1_000_000_000      ; 1e9, divisor for the "G" / GHz columns
Ticks DB Q                  ; receives each QueryPerformanceCounter reading
BaseFreq DB Q               ; TSC frequency in GHz, stored as a double by main

; Running minima over all passes of the main loop; T* = throughput kernel,
; L* = latency kernel. main seeds them with a large value before the first run.
TInstr DB Q                 ; instructions retired
TCycl DB Q                  ; actual core cycles
TRef DB Q                   ; reference cycles
TTSC DB Q                   ; TSC ticks
LInstr DB Q
LCycl DB Q
LRef DB Q
LTSC DB Q

; StartBench Type, Counter, Destination
; Takes the "before" sample of a hardware counter.
;   Type        - counter-read instruction emitted verbatim (RDPMC or RDTSC in this file)
;   Counter     - value loaded into ECX before the read (the RDPMC counter selector)
;   Destination - accepted but not used by the macro body
; Out:    R8 = 64-bit counter value (EDX:EAX merged)
; Clobb:  RAX, RCX, RDX, flags
; R8 must stay untouched until the matching EndBench subtracts it.
StartBench %MACRO Type, Counter, Destination
	mov ecx, %Counter      ; counter selector for RDPMC
	%Type                  ; result arrives split across EDX:EAX
	shl rdx, 32
	or rax, rdx            ; rax = full 64-bit reading
	mov r8, rax	           ; keep the start value for EndBench
%ENDMACRO

; EndBench Type, Counter, Message, Min
; Takes the "after" sample of a hardware counter, computes the delta against
; the value StartBench left in R8, folds it into a running minimum and prints
; that minimum.
;   Type    - counter-read instruction emitted verbatim (RDPMC or RDTSCP in this file)
;   Counter - value loaded into ECX before the read (the RDPMC counter selector)
;   Message - FmtOut format string taking one integer argument
;   Min     - qword variable holding the smallest delta seen so far
; In:     R8 = start value from StartBench
; Out:    [Min] = min([Min], delta); the same value is printed via FmtOut
; Clobb:  RAX, RBX, RCX, RDX, flags, upper halves of all YMM registers
EndBench %MACRO Type, Counter, Message, Min
	mov ecx, %Counter
	%Type
	shl rdx, 32
	or rax, rdx           ; rax = full 64-bit reading
	; The kernels leave the upper YMM halves dirty; clear them once the counter
	; has been read so the legacy-SSE and library code that follows does not
	; pay AVX/SSE transition penalties.
	vzeroupper
	sub rax, r8           ; delta = end - start
	mov rbx, [%Min]
	; Counter deltas are unsigned. With a signed compare a wrapped counter
	; (negative delta) would be stored as the minimum forever; CMOVA makes
	; such a sample look huge and therefore ignored.
	cmp rax, rbx
	cmova rax, rbx        ; rax = unsigned min(delta, previous minimum)
	mov [%Min], rax
	WinABI FmtOut, %Message, rax
%ENDMACRO

; ThroughputBench L
; Throughput kernel: an unrolled run of VMPSADBW instructions whose destination
; registers are never used as sources, so the four instructions in each group
; do not depend on each other or on earlier iterations.
;   L - unique label name for the loop head (each expansion needs its own)
; The outer loop runs 250_000 times over the %FOR-unrolled body.
; NOTE(review): if the %FOR range 0..1000 is inclusive, the body is expanded
; 1001 times rather than 1000 - confirm against the EuroAssembler manual.
; Clobb: R9, YMM0, YMM3, YMM6, YMM9, flags
ThroughputBench %MACRO L
mov r9, 250_000
%L:
i  %FOR  0..1000
	VMPSADBW YMM0, YMM1, YMM2, 0
	VMPSADBW YMM3, YMM4, YMM5, 0
	VMPSADBW YMM6, YMM7, YMM8, 0
	VMPSADBW YMM9, YMM10, YMM11, 0
; Alternative kernel (disabled): integer multiply
;    imul r10, r11
;    imul r11, r12
;    imul r12, r13
;	 imul r14, r14
   %ENDFOR
	dec r9
	jnz %L
%ENDMACRO

; LatencyBench L
; Latency kernel: every VMPSADBW reads and writes YMM0, so each instruction
; must wait for the result of the previous one (a single dependency chain).
;   L - unique label name for the loop head (each expansion needs its own)
; The outer loop runs 250_000 times over the %FOR-unrolled body.
; NOTE(review): if the %FOR range 0..4000 is inclusive, the body is expanded
; 4001 times rather than 4000 - confirm against the EuroAssembler manual.
; Clobb: R9, YMM0, flags
LatencyBench %MACRO L
mov r9, 250_000
%L:
i  %FOR  0..4000
	VMPSADBW YMM0, YMM0, YMM0, 0
    ; Alternative kernel (disabled): imul r10, r10
   %ENDFOR i
	dec r9
	jnz %L
%ENDMACRO
    
; main - program entry.
;   1. Seed all running minima with a value larger than any expected sample.
;   2. Pin the thread to one logical CPU and raise its priority.
;   3. Count TSC ticks over a ~1 s window timed with QueryPerformanceCounter,
;      store the count in [Freq] and print it as GHz.
; Registers live across calls are callee-saved: R12 = initial QPC reading,
; R13 = initial TSC, R14 = latest TSC delta, R15 = QPC ticks per second.
main: nop

mov rax, 100_000_000_000
mov [TInstr], rax
mov [TCycl], rax
mov [TRef], rax
mov [TTSC], rax
mov [LInstr], rax
mov [LCycl], rax
mov [LRef], rax
mov [LTSC], rax

	WinABI SetStdioWindowOptions,0,0
    WinABI GetCurrentThread                 ; RAX = handle of the current thread
    WinABI SetThreadAffinityMask, RAX, 1    ; affinity mask 1 = first logical CPU only
    ; RAX now holds SetThreadAffinityMask's return value, not the thread
    ; handle, so the handle has to be fetched again before it is reused.
    WinABI GetCurrentThread
    WinABI SetThreadPriority, RAX, 15       ; 15 = THREAD_PRIORITY_TIME_CRITICAL

    ; Store the real QPC rate where it is read back (it used to be written to
    ; Freq while the hard-coded QPFreq value was the one actually used).
    WinABI QueryPerformanceFrequency, QPFreq
	mov r15, [QPFreq]                       ; r15 = QPC ticks in one second

	WinABI QueryPerformanceCounter, Ticks
	mov r12, [Ticks]                        ; r12 = QPC reading at window start

	xor eax, eax                            ; defined CPUID leaf; CPUID is used only to serialize
	CPUID
    RDTSC
    SHL RDX, 32
    OR RAX, RDX
    MOV R13, RAX                            ; r13 = TSC at window start

; Poll until more than one second of QPC time has elapsed.
LC:
    RDTSCP
    SHL RDX, 32
    OR RAX, RDX
    SUB RAX, R13
	mov r14, rax                            ; r14 = TSC ticks since window start

	WinABI QueryPerformanceCounter, Ticks
	mov r11, [Ticks]
	sub r11, r12                            ; QPC ticks since window start
	cmp r11, r15
	jle LC

	mov [Freq], r14                         ; TSC ticks per ~1 s (integer)

    ; Convert the integers to doubles before dividing; loading the raw integer
    ; bits into XMM only produced the right quotient via subnormal arithmetic.
    cvtsi2sd xmm0, r14
    mov rax, [Gig]
    cvtsi2sd xmm1, rax
    divsd xmm0, xmm1                        ; xmm0 = TSC frequency in GHz
	movq [BaseFreq], xmm0
	WinABI FmtOut, CPUFreqFmt, xmm0	
	
; Main measurement loop (re-entered forever from the jmp at the end of the
; report). Each kernel is executed four times, once per counter, because a
; StartBench/EndBench pair samples a single counter:
;   RDPMC 0x40000000 -> printed as "Instructions Retired"
;   RDPMC 0x40000001 -> printed as "Actual Cycles"
;   RDPMC 0x40000002 -> printed as "Reference Cycles"
;   RDTSC / RDTSCP   -> printed as "TSC Ticks"
; L1..L8 are the unique loop labels handed to the kernel macros.
Loop:
	StdOutput MsgTP, Eol=Yes, Console=Yes

	; --- throughput kernel; minima kept in TInstr / TCycl / TRef / TTSC ---
	StartBench RDPMC, 0x40000000
ThroughputBench L1
	EndBench RDPMC, 0x40000000, Msg0, TInstr
	StartBench RDPMC, 0x40000001
ThroughputBench L2
	EndBench RDPMC, 0x40000001, Msg1, TCycl
	StartBench RDPMC, 0x40000002
ThroughputBench L3
	EndBench RDPMC, 0x40000002, Msg2, TRef
	StartBench RDTSC, 0x0
ThroughputBench L4
	EndBench RDTSCP, 0x0, MsgT, TTSC

 	StdOutput MsgLA, Eol=Yes, Console=Yes

	; --- latency kernel; minima kept in LInstr / LCycl / LRef / LTSC ---
	StartBench RDPMC, 0x40000000
LatencyBench L5
	EndBench RDPMC, 0x40000000, Msg0, LInstr
	StartBench RDPMC, 0x40000001
LatencyBench L6
	EndBench RDPMC, 0x40000001, Msg1, LCycl
	StartBench RDPMC, 0x40000002
LatencyBench L7
	EndBench RDPMC, 0x40000002, Msg2, LRef
	StartBench RDTSC, 0x0
LatencyBench L8
	EndBench RDTSCP, 0x0, MsgT, LTSC

; ---- Summary table -----------------------------------------------------------
; Prints the legend and one row per kernel, then restarts the measurement loop.
; Columns (in FmtOut argument order):
;   xmm0 VAL       = cycles / instructions
;   xmm1 IPC       = instructions / cycles
;   xmm2 Instr.    = instructions / 1e9
;   xmm3 Cycles    = cycles / 1e9
;   xmm5 TSCnt.    = TSC ticks / 1e9
;   xmm6 ExecT.    = TSC ticks / [Freq]            (seconds)
;   xmm7 Boost     = cycles / TSC ticks
;   xmm9 Base      = [Freq] / 1e9                  (GHz)
;   xmm8 Frequency = Base * Boost                  (GHz)
; All inputs are 64-bit integers and are converted with CVTSI2SD first. Loading
; the raw integer bits with MOVQ and dividing them only worked through
; subnormal arithmetic (it breaks under DAZ/FTZ and is slow).
; The reference-cycle minima (TRef / LRef) are recorded but have no column.

	StdOutput MsgLG, Eol=Yes, Console=Yes ; Legend

	; --- throughput row ---
	mov rax, [Gig]
	cvtsi2sd xmm15, rax      ; xmm15 = 1e9
	mov rax, [TInstr]
	cvtsi2sd xmm2, rax       ; xmm2 = instructions
	mov rax, [TCycl]
	cvtsi2sd xmm3, rax       ; xmm3 = cycles
	mov rax, [TTSC]
	cvtsi2sd xmm5, rax       ; xmm5 = TSC ticks
	mov rax, [Freq]
	cvtsi2sd xmm9, rax       ; xmm9 = TSC ticks per second

	movq xmm0, xmm3
	divsd xmm0, xmm2         ; VAL
	movq xmm1, xmm2
	divsd xmm1, xmm3         ; IPC
	movq xmm6, xmm5
	divsd xmm6, xmm9         ; ExecT.
	movq xmm7, xmm3
	divsd xmm7, xmm5         ; Boost
	divsd xmm2, xmm15        ; Instr. (G)
	divsd xmm3, xmm15        ; Cycles (G)
	divsd xmm5, xmm15        ; TSCnt. (G)
	divsd xmm9, xmm15        ; Base (GHz)
	movq xmm8, xmm9
	mulsd xmm8, xmm7         ; Frequency (GHz)

	WinABI FmtOut, FmtT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7, xmm9, xmm8	

	; --- latency row (1e9 is reloaded so nothing has to survive the call above) ---
	mov rax, [Gig]
	cvtsi2sd xmm15, rax      ; xmm15 = 1e9
	mov rax, [LInstr]
	cvtsi2sd xmm2, rax       ; xmm2 = instructions
	mov rax, [LCycl]
	cvtsi2sd xmm3, rax       ; xmm3 = cycles
	mov rax, [LTSC]
	cvtsi2sd xmm5, rax       ; xmm5 = TSC ticks
	mov rax, [Freq]
	cvtsi2sd xmm9, rax       ; xmm9 = TSC ticks per second

	movq xmm0, xmm3
	divsd xmm0, xmm2         ; VAL
	movq xmm1, xmm2
	divsd xmm1, xmm3         ; IPC
	movq xmm6, xmm5
	divsd xmm6, xmm9         ; ExecT.
	movq xmm7, xmm3
	divsd xmm7, xmm5         ; Boost
	divsd xmm2, xmm15        ; Instr. (G)
	divsd xmm3, xmm15        ; Cycles (G)
	divsd xmm5, xmm15        ; TSCnt. (G)
	divsd xmm9, xmm15        ; Base (GHz)
	movq xmm8, xmm9
	mulsd xmm8, xmm7         ; Frequency (GHz)

	WinABI FmtOut, FmtL, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7, xmm9, xmm8	

	StdOutput MsgNL, Eol=Yes, Console=Yes
	jmp Loop                 ; measure again; the minima keep improving across passes

; Not reached: the loop above never exits.
TerminateProgram;

ENDPROGRAM

On Intel Haswell, for VMPSADBW YMM, YMM, YMM, mask (expected: latency 6 cycles, throughput one instruction every 2 cycles)

Result:

Test | VAL  | IPC  | Instr. | Cycles | TSCnt. | ExecT. | Boost | Base | Frequency
 THR   2.10   0.48   1.01 G   2.13 G   2.03 G   0.62 s   x1.05   3.29   3.46 GHz
 LAT   6.27   0.16   1.18 G   7.38 G   6.86 G   2.08 s   x1.08   3.29   3.54 GHz
---------------------------------------------------------------------------------