Zero-Cost Abstractions — A Practical Check

I have seen many posts about “zero-cost abstractions.” Let’s look at this more carefully — this makes for a good technical exercise.

Test Setup

Tested with:

  • Rust 1.96.0 (released May 28, 2026)
  • GCC 16.1.0 (released April 30, 2026)

Source Code

Rust

use std::time::Instant;

/// Sums `2 * x` over every even element of `data` using an iterator chain.
///
/// All arithmetic wraps on overflow. The benchmark relies on the 32-bit sum
/// wrapping around, and plain `*` / `.sum()` would panic in builds that have
/// overflow checks enabled (e.g. debug builds).
fn sum_even_doubled_iter(data: &[i32]) -> i32 {
    data.iter()
        .filter(|&&x| x % 2 == 0)
        .map(|&x| x.wrapping_mul(2))
        .fold(0i32, i32::wrapping_add)
}

/// Sums `2 * x` over every even element of `data` using an explicit `for` loop.
///
/// Same contract as [`sum_even_doubled_iter`]: all arithmetic wraps on overflow.
fn sum_even_doubled_loop(data: &[i32]) -> i32 {
    let mut sum = 0i32;
    for &x in data {
        if x % 2 == 0 {
            sum = sum.wrapping_add(x.wrapping_mul(2));
        }
    }
    sum
}

/// Benchmarks the iterator chain against the equivalent `for` loop and prints
/// both sums with their wall-clock durations.
///
/// Note: the input vector holds 2_147_418_111 `i32` values, i.e. roughly 8 GiB.
fn main() {
    // Create 2_147_418_111 integers so that the wrapped 32-bit sum is 65536
    let data: Vec<i32> = (0..2_147_418_111).collect();

    // Warm-up. `black_box` keeps the otherwise unused result observable so the
    // optimizer cannot discard the whole pass.
    std::hint::black_box(sum_even_doubled_iter(&data));

    // Measure iterator chain
    let start = Instant::now();
    let sum_iter = sum_even_doubled_iter(&data);
    let dur_iter = start.elapsed();

    // Measure equivalent for-loop
    let start = Instant::now();
    let sum_loop = sum_even_doubled_loop(&data);
    let dur_loop = start.elapsed();

    println!("Iterator sum: {}; time: {:?}", sum_iter, dur_iter);
    println!("Loop sum: {}; time: {:?}", sum_loop, dur_loop);
}

C++

#include <vector>
#include <numeric>
#include <chrono>
#include <fmt/core.h>

using namespace std;
using fmt::print;

// Benchmarks std::accumulate against an equivalent range-for loop and prints
// both sums with their wall-clock durations in milliseconds.
int main() {
    // Same amount as in Rust; chosen so that the wrapped 32-bit sum is 65536.
    constexpr int kCount = 2'147'418'111;

    vector<int> data;
    data.reserve(kCount);

    // Bound the loop by the requested count, not capacity(): reserve() may
    // allocate more than asked, and an int counter cannot go past INT_MAX.
    for (int i = 0; i < kCount; ++i) {
        data.push_back(i);
    }

    // Accumulate in unsigned arithmetic. The sum is meant to wrap around, and
    // signed int overflow is undefined behaviour, whereas unsigned arithmetic
    // is defined to be modular. Converting back to int is modular in C++20.
    auto add_doubled_if_even = [](unsigned acc, int x) {
        return (x % 2 == 0) ? acc + static_cast<unsigned>(x) * 2u : acc;
    };

    // Warm-up; volatile keeps the otherwise unused result from being optimized out.
    volatile int warmup = static_cast<int>(
        accumulate(data.begin(), data.end(), 0u, add_doubled_if_even));

    // steady_clock is monotonic, which is what interval measurement needs.
    auto start = chrono::steady_clock::now();
    int sum_iter = static_cast<int>(
        accumulate(data.begin(), data.end(), 0u, add_doubled_if_even));
    auto dur_iter = chrono::steady_clock::now() - start;

    start = chrono::steady_clock::now();
    unsigned loop_acc = 0;
    for (int x : data) {
        if (x % 2 == 0) {
            loop_acc += static_cast<unsigned>(x) * 2u;
        }
    }
    auto dur_loop = chrono::steady_clock::now() - start;
    int sum_loop = static_cast<int>(loop_acc);

    print("Iterator sum: {}; time: {}ms\n",
        sum_iter,
        chrono::duration<double, milli>(dur_iter).count());

    print("Loop sum: {}; time: {}ms\n",
        sum_loop,
        chrono::duration<double, milli>(dur_loop).count());
    return 0;
}

As you can see, the Rust code is cleaner and shorter.

Results

On Xeon w5-2445 @ 3.1 GHz:

| Language | O3/Release | with =native |
|----------|------------|--------------|
| Rust     | 713-714 ms | 571-572 ms   |
| C++      | 751-752 ms | 654-655 ms   |

Rust Release:

>r-bench.exe
Iterator sum: 65536; time: 713.4694 ms
Loop sum: 65536; time: 713.5939 ms

Rust with rustflags = ["-C", "target-cpu=native"]:

>r-perf.exe
Iterator sum: 65536; time: 571.5756ms
Loop sum: 65536; time: 571.1088ms

C++ O3:

g++ -O3 -std=c++20 bench.cpp -o bench -lfmt
>bench.exe
Iterator sum: 65536; time: 751.3733 ms
Loop sum: 65536; time: 751.3685 ms

C++ with -march=native:

g++ -O3 -march=native -std=c++20 bench.cpp -o bench -lfmt
>c-bench.exe
Iterator sum: 65536; time: 655.0348 ms
Loop sum: 65536; time: 654.454 ms

Observations

  • In Rust, the iterator pipeline and the explicit for loop produce identical performance.
  • In C++, the same holds: std::accumulate and a manual loop perform equivalently.
  • Rust is slightly faster in this particular test.

Assembly Analysis

To verify what is happening, we inspect the generated assembly.

Rust:

cargo rustc --release -- --emit asm -C opt-level=3 -C  "llvm-args=-x86-asm-syntax=intel"

GCC:

g++ -O3 -march=native -std=c++20 -S -fverbose-asm -masm=intel bench.cpp

Alternatively, disassemble the built executable with objdump:

objdump -d -M intel bench.exe > dump.s

Hot Loops:

let sum_iter: i32 = data.iter().filter(|x| **x % 2 == 0).map(|x| x * 2).sum();
// and
    for x in &data {
        if x % 2 == 0 {
            sum_loop += x * 2;
        }
    }

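Both forms compile into the same SIMD loop, but the vector width and unrolling depend on the compiler and its options. The four listings below are, in order: Rust (release), Rust (target-cpu=native), GCC (-O3), and GCC (-O3 -march=native):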

# Hot loop, 128-bit (xmm) variant. The .LBB label style and lowercase
# "xmmword ptr" suggest LLVM output, i.e. presumably the Rust release build.
# Two 16-byte loads per iteration (8 x i32, see "add rax, 8"), feeding two
# independent accumulators (xmm2 and xmm1).
# NOTE(review): xmm6 presumably holds the low-bit mask and xmm0 zeros; both are
# set up outside this excerpt.
.LBB4_8:
	movdqu	xmm3, xmmword ptr [r9 + 4*rax - 16]
	movdqu	xmm4, xmmword ptr [r9 + 4*rax]
	movdqa	xmm5, xmm3
	pand	xmm5, xmm6
	movdqa	xmm7, xmm4
	pand	xmm7, xmm6
# Doubling is done as x + x.
	paddd	xmm3, xmm3
	paddd	xmm4, xmm4
# Compare-equal builds a per-lane mask; AND-ing it with the doubled values
# zeroes the lanes that must not contribute before they are accumulated.
	pcmpeqd	xmm5, xmm0
	pand	xmm5, xmm3
	paddd	xmm2, xmm5
	pcmpeqd	xmm7, xmm0
	pand	xmm7, xmm4
	paddd	xmm1, xmm7
	add	rax, 8
# The trip count is a compile-time constant.
	cmp	rax, 2147418108
	jne	.LBB4_8
# Hot loop, 256-bit (ymm) variant. The .LBB label style suggests LLVM output,
# i.e. presumably the Rust build with target-cpu=native.
# The opmask registers (k1-k4), vptestnmd and ymm16-ymm18 are AVX-512
# (EVEX-encoded, 256-bit vector length) features, not plain AVX2.
# Four accumulators (ymm0-ymm3); the loop body appears twice, so one full pass
# covers eight 32-byte loads = 64 x i32 (see "add rax, 64").
# NOTE(review): ymm18 presumably holds the low-bit mask; it is set up outside
# this excerpt.
.LBB4_5:
	vmovdqu	ymm4, ymmword ptr [r9 + 4*rax - 224]
	vmovdqu	ymm5, ymmword ptr [r9 + 4*rax - 192]
	vmovdqu64	ymm16, ymmword ptr [r9 + 4*rax - 160]
	vmovdqu64	ymm17, ymmword ptr [r9 + 4*rax - 128]
# vptestnmd sets a mask bit for each lane where (lane AND ymm18) == 0.
	vptestnmd	k1, ymm4, ymm18
	vptestnmd	k2, ymm5, ymm18
	vptestnmd	k3, ymm16, ymm18
	vptestnmd	k4, ymm17, ymm18
# Doubling is done as x + x.
	vpaddd	ymm4, ymm4, ymm4
	vpaddd	ymm5, ymm5, ymm5
	vpaddd	ymm16, ymm16, ymm16
	vpaddd	ymm17, ymm17, ymm17
# Masked adds: only lanes selected by k1-k4 are accumulated.
	vpaddd	ymm0 {k1}, ymm0, ymm4
	vpaddd	ymm1 {k2}, ymm1, ymm5
	vpaddd	ymm2 {k3}, ymm2, ymm16
	vpaddd	ymm3 {k4}, ymm3, ymm17
# Loop exit test sits between the two copies of the body.
	cmp	rax, 2147418104
	je	.LBB4_9
# Second copy of the body, handling the next 32 x i32.
	vmovdqu	ymm4, ymmword ptr [r9 + 4*rax - 96]
	vmovdqu	ymm5, ymmword ptr [r9 + 4*rax - 64]
	vmovdqu64	ymm16, ymmword ptr [r9 + 4*rax - 32]
	vmovdqu64	ymm17, ymmword ptr [r9 + 4*rax]
	vptestnmd	k1, ymm4, ymm18
	vptestnmd	k2, ymm5, ymm18
	vptestnmd	k3, ymm16, ymm18
	vptestnmd	k4, ymm17, ymm18
	vpaddd	ymm4, ymm4, ymm4
	vpaddd	ymm5, ymm5, ymm5
	vpaddd	ymm16, ymm16, ymm16
	vpaddd	ymm17, ymm17, ymm17
	vpaddd	ymm0 {k1}, ymm0, ymm4
	vpaddd	ymm1 {k2}, ymm1, ymm5
	vpaddd	ymm2 {k3}, ymm2, ymm16
	vpaddd	ymm3 {k4}, ymm3, ymm17
	add	rax, 64
	jmp	.LBB4_5
# Hot loop, 128-bit (xmm) variant. The .L label style and uppercase
# "XMMWORD PTR" suggest GCC output, i.e. presumably the g++ -O3 build.
# One 16-byte load per iteration (4 x i32) and a single accumulator (xmm2);
# no unrolling.
# NOTE(review): xmm6 presumably holds the low-bit mask and xmm7 zeros; both are
# set up outside this excerpt.
.L30:
	movdqu	xmm1, XMMWORD PTR [rsi]
	add	rsi, 16
	movdqa	xmm0, xmm1
# Doubling is done as a left shift by one.
	pslld	xmm1, 1
	pand	xmm0, xmm6
	pcmpeqd	xmm0, xmm7
	pand	xmm0, xmm1
	paddd	xmm2, xmm0
	cmp	rsi, rax
	jne	.L30
# After the loop: horizontal reduction of the four lanes of xmm2 into edi
# (fold the upper 8 bytes onto the lower, then the next 4 bytes).
	movdqa	xmm0, xmm2
	sal	rcx, 2
	psrldq	xmm0, 8
	paddd	xmm2, xmm0
	movdqa	xmm0, xmm2
	psrldq	xmm0, 4
	paddd	xmm2, xmm0
	movd	edi, xmm2
# NOTE(review): presumably checks whether leftover elements remain for a
# scalar tail; the target .L31 is outside this excerpt.
	cmp	rcx, r15
	je	.L31
# Hot loop, 256-bit (ymm) variant. The .L label style and uppercase
# "YMMWORD PTR" suggest GCC output, i.e. presumably the -march=native build.
# One 32-byte load per iteration (8 x i32) and a single accumulator (ymm1);
# no unrolling. The opmask register k1 and vptestnmd are AVX-512 features
# (EVEX-encoded, 256-bit vector length), not plain AVX2.
# NOTE(review): ymm3 presumably holds the low-bit mask; it is set up outside
# this excerpt.
.L39:
	vmovdqu	ymm0, YMMWORD PTR [rsi]
	add	rsi, 32
	vpandd	ymm2, ymm0, ymm3
# Doubling is done as a left shift by one.
	vpslld	ymm0, ymm0, 1
# k1 gets a bit for each lane where the masked value is zero.
	vptestnmd	k1, ymm2, ymm2
# Masked add: only lanes selected by k1 are accumulated.
	vpaddd	ymm1{k1}, ymm1, ymm0
	cmp	rax, rsi
	jne	.L39

The timing difference comes from vectorization:

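  • with default optimization, both compilers emit 128-bit SIMD (xmm) instructions
  • with native, both compilers emit 256-bit SIMD (ymm) instructions; the opmask registers (k1–k4) and vptestnmd in the listings show that these are AVX-512 instructions at 256-bit vector length rather than plain AVX2
  • Rust (LLVM) unrolls the loop more aggressively and is slightly faster.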

👉 Wider SIMD registers allow processing more elements per iteration, resulting in slightly better throughput.