10 Ways to Add Two Large Arrays in Rust Using ndarray
Rust’s ndarray crate is a powerful foundation for numerical and scientific computing — but just like with any data‑heavy workload, performance depends heavily on how you write your loops.
To explore this, I built a small educational demo that evaluates 10 different techniques for adding two 2D matrices (`Array2<f32>`),
ranging from:
✅ Plain nested loops ✅ Slice‑based linear iteration ✅ indexed_iter_mut() ✅ Zip::for_each ✅ Zip::par_for_each (Rayon) ✅ Built‑in ndarray operators ✅ In‑place operations ✅ Unsafe unchecked indexing ✅ SIMD (AVX2) ✅ SIMD + Rayon parallelism (fastest)
Each method is benchmarked on a 1024×1024 array in debug builds and on a 4096×4096 array in release builds.
GitHub: https://github.com/AndrDm/r-ndarray
Code Snippet
# Cargo manifest for the demo crate.
[package]
name = "r-arr2"
version = "0.1.0"
edition = "2024"
[dependencies]
# The "rayon" feature enables ndarray's parallel Zip API (par_for_each, method 7).
ndarray = { version = "0.17.2", features = ["rayon"] }
# Parallel slice iterators (par_chunks / par_chunks_mut) used by method 10.
rayon = "1.11.0"
# Provides the f32x8 SIMD vector type used by methods 9 and 10.
wide = "1.2.0"
//===================================================================================
//
// Title: ndarray Array Addition Examples (Educational Demo)
// Purpose: Demonstrate multiple ways to add two 2D Array2<f32> matrices
// in Rust using ndarray, including:
// ✅ Plain nested loops
// ✅ Slice-based linear iteration
// ✅ indexed_iter_mut()
// ✅ Zip (sequential and parallel)
// ✅ Built‑in ndarray operators
// ✅ In‑place arithmetic
// ✅ Unsafe unchecked indexing
// ✅ SIMD using wide::f32x8 + Parallel version
//
// Created on: 09.04.2026 at 08:45:15 by AD.
//
//===================================================================================
use ndarray::{Array2, Zip};
use rayon::{
// (parallel slice + iterator traits)
iter::{IndexedParallelIterator, ParallelIterator},
slice::{ParallelSlice, ParallelSliceMut},
};
use std::time::Instant; // From Standard library
use wide::f32x8; // for SIMD
/// Runs every array-addition variant once and prints `res[0][0]` plus the
/// elapsed wall-clock time in whole milliseconds for each of them.
///
/// Every benchmark lives in its own `{ ... }` block. Shadowing a binding does
/// not drop the previous value, so without these blocks all input and output
/// arrays of all sections would stay allocated until `main` returns
/// (36 arrays x 64 MiB in a release build). With the blocks, each section's
/// arrays are freed before the next section allocates its own.
///
/// The timer of each section is started after its arrays have been created,
/// so array construction is not part of the measurement unless noted.
fn main() {
    // 1024x1024 = 1.048.576 elements (4 MiB); 4096x4096 = 16.777.216 elts (64 MiB)
    const ROWS: usize = if cfg!(debug_assertions) { 1024 } else { 4096 };
    const COLS: usize = if cfg!(debug_assertions) { 1024 } else { 4096 };
    // Number of f32 lanes handled per SIMD step in sections 9 and 10.
    // It must equal the lane count of `f32x8`: changing it alone is not enough,
    // the vector type has to be switched to a matching width as well.
    const LANES: usize = 8;

    println!("Hello, world of ndarray ({}x{})!", ROWS, COLS);

    //===============================================================================
    // 1) Simple, explicit nested for loops (row/column loops)
    // This version is closest to what you'd write in C — very easy.
    // ✅ Easiest to read
    // ✅ Clear memory indexing
    // ❌ Average performance and least idiomatic
    //    (every `[(i, j)]` access is bounds-checked)
    //
    {
        let a = Array2::<f32>::from_elem((ROWS, COLS), 1.0);
        let b = Array2::<f32>::from_elem((ROWS, COLS), 2.0);
        let mut res = Array2::<f32>::zeros((ROWS, COLS));
        let t1 = Instant::now();
        for i in 0..ROWS {
            for j in 0..COLS {
                res[(i, j)] = a[(i, j)] + b[(i, j)];
            }
        }
        println!(
            "Done 1 (plain loops)\t- res[0][0] = {}, time = {} ms",
            res[(0, 0)],
            t1.elapsed().as_millis()
        );
    }

    //============================================================================I==
    // 2) Using built-in ndarray element-wise addition
    // If the goal is "just add arrays", this is the canonical one-liner.
    // ✅ Very short
    // ✅ Uses ndarray's operator overloading
    // ❌ Creates a new array (cannot reuse res), so the allocation of the
    //    result is part of the measured time
    // ❌ Not good when you need in-place updates
    //
    {
        let a = Array2::<f32>::from_elem((ROWS, COLS), 1.0);
        let b = Array2::<f32>::from_elem((ROWS, COLS), 2.0);
        let t2 = Instant::now();
        let res = &a + &b;
        println!(
            "Done 2 (built-in)\t- res[0][0] = {}, time = {} ms",
            res[(0, 0)],
            t2.elapsed().as_millis()
        );
    }

    //===============================================================================
    // 3) Single for loop using .as_slice().unwrap()
    // This flattens the arrays and performs element-wise addition with one loop.
    // ✅ One loop only
    // ✅ Very fast — linear memory access
    // ❌ Requires contiguous arrays: as_slice() returns None otherwise and the
    //    unwrap() would panic (arrays built with from_elem/zeros are contiguous)
    //
    {
        let a = Array2::<f32>::from_elem((ROWS, COLS), 1.0);
        let b = Array2::<f32>::from_elem((ROWS, COLS), 2.0);
        let mut res = Array2::<f32>::zeros((ROWS, COLS));
        let t3 = Instant::now();
        let a_slice = a.as_slice().unwrap();
        let b_slice = b.as_slice().unwrap();
        let res_slice = res.as_slice_mut().unwrap();
        for i in 0..a_slice.len() {
            res_slice[i] = a_slice[i] + b_slice[i];
        }
        println!(
            "Done 3 (slices)\t- res[0][0] = {}, time = {} ms",
            res[(0, 0)],
            t3.elapsed().as_millis()
        );
    }

    //===============================================================================
    // 4) Using .indexed_iter_mut() (clean, explicit, safe)
    // ✅ Cleaner than nested loops — no manual outer/inner loops
    // ✅ Still very explicit: you see the indices
    // ✅ Works regardless of array memory layout (unlike .as_slice())
    // ✅ More Rust-idiomatic than raw index loops
    // ❌ Slightly slower than Zip
    //    Because each iteration does two bounds-checked a[(i, j)] lookups.
    //    (You can remove bounds checks by using uget() if desired, see 8.)
    //
    {
        let a = Array2::<f32>::from_elem((ROWS, COLS), 1.0);
        let b = Array2::<f32>::from_elem((ROWS, COLS), 2.0);
        let mut res = Array2::<f32>::zeros((ROWS, COLS));
        let t4 = Instant::now();
        for ((i, j), r) in res.indexed_iter_mut() {
            *r = a[(i, j)] + b[(i, j)];
        }
        println!(
            "Done 4 (indexed_iter)\t- res[0][0] = {}, time = {} ms",
            res[(0, 0)],
            t4.elapsed().as_millis()
        );
    }

    //==========================================================================P=I==
    // 5A) Idiomatic Rust using ndarray::Zip (recommended!)
    // This is the cleanest and most idiomatic way in ndarray.
    // ✅ Most idiomatic
    // ✅ Auto-vectorization friendly
    // ✅ No manual indexing
    // ℹ️ `.and()` expects all operands to have the same shape, as they do here.
    //    Broadcasting an operand is a separate call (`Zip::and_broadcast`) and
    //    is not used in this demo.
    //
    {
        let a = Array2::<f32>::from_elem((ROWS, COLS), 1.0);
        let b = Array2::<f32>::from_elem((ROWS, COLS), 2.0);
        let mut res = Array2::<f32>::zeros((ROWS, COLS));
        let t5a = Instant::now();
        Zip::from(&mut res).and(&a).and(&b).for_each(|r, &x, &y| {
            *r = x + y;
        });
        println!(
            "Done 5A (for_each)\t- res[0][0] = {}, time = {} ms",
            res[(0, 0)],
            t5a.elapsed().as_millis()
        );
    }

    //==========================================================================P=I==
    // 5B) Idiomatic Rust using ndarray::Zip, compact form
    // Same traversal as 5A; the closure is written as a single expression
    // instead of a block. No broadcasting is involved: res, a and b all have
    // the shape (ROWS, COLS).
    //
    {
        let a = Array2::<f32>::from_elem((ROWS, COLS), 1.0);
        let b = Array2::<f32>::from_elem((ROWS, COLS), 2.0);
        let mut res = Array2::<f32>::zeros((ROWS, COLS));
        let t5b = Instant::now();
        Zip::from(&mut res).and(&a).and(&b).for_each(|r, &x, &y| *r = x + y);
        println!(
            "Done 5B (for_each)\t- res[0][0] = {}, time = {} ms",
            res[(0, 0)],
            t5b.elapsed().as_millis()
        );
    }

    //============================================================================I==
    // 6A) In-place addition using +=
    // ✅ Very idiomatic
    // ✅ Zero manual loops
    // ✅ Leverages ndarray's broadcasting + in-place math
    // ✅ Usually fast
    // ❌ Requires ownership over res
    // ℹ️ The timer starts before a.clone(), so copying `a` into the result
    //    buffer is part of the measured time.
    //
    {
        let a = Array2::<f32>::from_elem((ROWS, COLS), 1.0);
        let b = Array2::<f32>::from_elem((ROWS, COLS), 2.0);
        let t6a = Instant::now();
        let mut res = a.clone();
        res += &b;
        println!(
            "Done 6A (inplace +=)\t- res[0][0] = {}, time = {} ms",
            res[(0, 0)],
            t6a.elapsed().as_millis()
        );
    }

    //============================================================================I==
    // 6B) In-place addition using += without clone
    // res starts out as all zeros, so adding a and then b yields a + b.
    // This makes two full passes over res instead of one.
    //
    {
        let a = Array2::<f32>::from_elem((ROWS, COLS), 1.0);
        let b = Array2::<f32>::from_elem((ROWS, COLS), 2.0);
        let mut res = Array2::<f32>::zeros((ROWS, COLS));
        let t6b = Instant::now();
        res += &a;
        res += &b;
        println!(
            "Done 6B (inplace +=)\t- res[0][0] = {}, time = {} ms",
            res[(0, 0)],
            t6b.elapsed().as_millis()
        );
    }

    //==========================================================================P====
    // 7) Parallel version (educational for parallelism)
    // ✅ Shows how to parallelize data-parallel loops
    // ✅ Very appealing for performance education
    // ❌ Requires enabling features = ["rayon"] in Cargo.toml:
    //    ndarray = { version = "0.17.2", features = ["rayon"] }
    //    When the rayon feature is enabled, ndarray does add parallel iterator traits
    //    Do not need to import the parallel prelude explicitly when using par_for_each
    // ❌ Does NOT use explicit SIMD, but compiler may autovectorize
    //
    {
        let a = Array2::<f32>::from_elem((ROWS, COLS), 1.0);
        let b = Array2::<f32>::from_elem((ROWS, COLS), 2.0);
        let mut res = Array2::<f32>::zeros((ROWS, COLS));
        let t7 = Instant::now();
        Zip::from(&mut res).and(&a).and(&b).par_for_each(|r, &x, &y| {
            *r = x + y;
        });
        println!(
            "Done 7 (par_for_each)\t- res[0][0] = {}, time = {} ms",
            res[(0, 0)],
            t7.elapsed().as_millis()
        );
    }

    //==========================================================================P====
    // 8) Unsafe fast version using unchecked indexing
    // ✅ Shows how to skip bounds checks
    // ✅ Good for "performance tuning" lessons
    // ❌ Not recommended for beginners
    // ⚠️ unsafe!
    //
    {
        let a = Array2::<f32>::from_elem((ROWS, COLS), 1.0);
        let b = Array2::<f32>::from_elem((ROWS, COLS), 2.0);
        let mut res = Array2::<f32>::zeros((ROWS, COLS));
        let t8 = Instant::now();
        for ((i, j), r) in res.indexed_iter_mut() {
            // SAFETY: (i, j) is produced by iterating `res`, so it is a valid
            // index of `res`; `a` and `b` were created with the same
            // (ROWS, COLS) shape as `res`, so it is in bounds for them too.
            unsafe {
                *r = *a.uget([i, j]) + *b.uget([i, j]);
            }
        }
        println!(
            "Done 8 (unsafe)\t- res[0][0] = {}, time = {} ms",
            res[(0, 0)],
            t8.elapsed().as_millis()
        );
    }

    //===============================================================================
    // 9) SIMD Version using wide::f32x8 and not std::simd
    // ✅ Explicit SIMD on a single thread
    // ✅ Uses portable vectors on stable Rust
    // ✅ Processes 8 elements per iteration
    // ❌ Requires arrays to be contiguous (they are here)
    // ℹ️ NOTE(review): whether f32x8 maps to AVX2 instructions presumably depends
    //    on the target features the crate is compiled with — confirm build flags.
    // ℹ️ LANES must match the vector type: a 16-lane variant needs a 16-lane
    //    vector type in place of f32x8, not just LANES = 16.
    //
    {
        let a = Array2::<f32>::from_elem((ROWS, COLS), 1.0);
        let b = Array2::<f32>::from_elem((ROWS, COLS), 2.0);
        let mut res = Array2::<f32>::zeros((ROWS, COLS));
        let t9 = Instant::now();
        let a_slice = a.as_slice().unwrap();
        let b_slice = b.as_slice().unwrap();
        let res_slice = res.as_slice_mut().unwrap();
        let len = a_slice.len();
        // Number of complete LANES-wide groups; the remainder is handled below.
        let chunks = len / LANES;
        for chunk in 0..chunks {
            let idx = chunk * LANES;
            // Load 8 floats from each input into a vector
            let va = f32x8::new(a_slice[idx..idx + LANES].try_into().unwrap());
            let vb = f32x8::new(b_slice[idx..idx + LANES].try_into().unwrap());
            let vr = va + vb; // ADD
            // Store result back
            let arr = vr.to_array();
            res_slice[idx..idx + LANES].copy_from_slice(&arr);
        }
        // Scalar tail: elements left over when len is not a multiple of LANES
        for i in (chunks * LANES)..len {
            res_slice[i] = a_slice[i] + b_slice[i];
        }
        println!(
            "Done 9 (SIMD AVX2)\t- res[0][0] = {}, time = {} ms",
            res[(0, 0)],
            t9.elapsed().as_millis()
        );
    }

    //===============================================================================
    // ✅ Option: helper function (tiny and clean)
    // Loads the LANES consecutive floats starting at `idx` into one vector.
    // Panics if `idx + LANES` exceeds `slice.len()`.
    #[inline]
    fn load8(slice: &[f32], idx: usize) -> f32x8 {
        // The explicit array type ties LANES to the vector width at compile
        // time: a LANES value other than 8 fails to type-check here instead
        // of panicking in unwrap() at run time.
        let lanes: [f32; LANES] = slice[idx..idx + LANES].try_into().unwrap();
        f32x8::new(lanes)
    }

    //===============================================================================
    // ✅ Option: a neat macro (closest to real SIMD "load" semantics)
    // Same operation as the load8 function, expanded inline at the call site.
    macro_rules! load8 {
        ($slice:expr, $i:expr) => {
            f32x8::new($slice[$i..$i + LANES].try_into().unwrap())
        };
    }

    //===============================================================================
    // 10) SIMD + Parallel version (SIMD on all CPU threads)
    // ✅ Uses Rayon to split work across CPU cores (one task item per row)
    // ✅ Uses wide::f32x8 SIMD inside each thread
    // ✅ Intended to be the fastest version for large arrays — measure on your
    //    own machine
    // ✅ Perfect for demonstrating hybrid data-parallel + SIMD parallelism
    // ❌ Requires arrays to be contiguous
    //
    {
        let a = Array2::<f32>::from_elem((ROWS, COLS), 1.0);
        let b = Array2::<f32>::from_elem((ROWS, COLS), 2.0);
        let mut res = Array2::<f32>::zeros((ROWS, COLS));
        let t10 = Instant::now();
        let a_slice = a.as_slice().unwrap();
        let b_slice = b.as_slice().unwrap();
        let res_slice = res.as_slice_mut().unwrap();
        // Shape of the left part of the zipped item: (output row, row of a).
        type RowChunk<'a> = (&'a mut [f32], &'a [f32]);
        // Chunks of COLS elements correspond to the rows of the flattened
        // arrays; the three chunk iterators are walked in lockstep.
        res_slice
            .par_chunks_mut(COLS)
            .zip(a_slice.par_chunks(COLS))
            .zip(b_slice.par_chunks(COLS))
            .for_each(|((res_row, a_row), b_row): (RowChunk, &[f32])| {
                let row_len = a_row.len();
                let chunks = row_len / LANES;
                // SIMD part
                for c in 0..chunks {
                    let idx = c * LANES;
                    let va = load8(a_row, idx); // Usage of fn
                    let vb = load8!(b_row, idx); // Usage of macro
                    let vr = va + vb;
                    let result = vr.to_array();
                    res_row[idx..idx + LANES].copy_from_slice(&result);
                }
                // Scalar tail
                for i in (chunks * LANES)..row_len {
                    res_row[i] = a_row[i] + b_row[i];
                }
            });
        println!(
            "Done 10 (SIMD/Parallel)\t- res[0][0] = {}, time = {} ms",
            res_slice[0],
            t10.elapsed().as_millis()
        );
    }
}