SIMD Internals Deep Dive
This deep dive explores the internal mechanisms of SIMD (Single Instruction Multiple Data) vectorization in modern C++ applications.
SIMD Fundamentals
The Vector Advantage
SIMD processes multiple data elements with a single instruction:
Scalar: 4 operations for 4 floats
┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐
│ a+b │ │ c+d │ │ e+f │ │ g+h │
└─────┘ └─────┘ └─────┘ └─────┘
SIMD (SSE): 1 operation for 4 floats
┌─────────────────────┐
│ [a,c,e,g] + [b,d,f,h] │
└─────────────────────┘SIMD Register Sizes
| ISA | Register Size | Floats | Doubles | Int32 |
|---|---|---|---|---|
| SSE2 | 128-bit | 4 | 2 | 4 |
| AVX | 256-bit | 8 | 4 | 8 |
| AVX-512 | 512-bit | 16 | 8 | 16 |
Auto-Vectorization Conditions
The compiler can automatically vectorize loops when specific conditions are met.
Good Patterns
cpp
// ✅ Simple, contiguous access
void add_arrays(float* a, float* b, float* c, size_t n) {
for (size_t i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
}
}
// ✅ No aliasing (restrict)
void add_arrays_restrict(float* __restrict a, float* __restrict b,
float* __restrict c, size_t n) {
for (size_t i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
}
}
// ✅ Known trip count
constexpr size_t N = 1024;
void process_known(float data[N]) {
for (size_t i = 0; i < N; ++i) {
data[i] *= 2.0f;
}
}Bad Patterns
cpp
// ❌ Data dependency (prefix sum)
void prefix_sum(float* data, size_t n) {
for (size_t i = 1; i < n; ++i) {
data[i] += data[i - 1]; // Each iteration depends on previous
}
}
// ❌ Indirect access (gather)
void gather(float* src, int* indices, float* dst, size_t n) {
for (size_t i = 0; i < n; ++i) {
dst[i] = src[indices[i]]; // Non-contiguous access
}
}
// ❌ Branch inside loop
void conditional(float* data, size_t n, float threshold) {
for (size_t i = 0; i < n; ++i) {
if (data[i] > threshold) { // Branch prevents vectorization
data[i] *= 2.0f;
}
}
}
// ❌ Function call
void with_call(float* data, size_t n) {
for (size_t i = 0; i < n; ++i) {
data[i] = std::sqrt(data[i]); // Unknown function
}
}Compiler Hints
cpp
// GCC/Clang: Suggest vectorization
#pragma GCC ivdep
for (size_t i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
}
// OpenMP SIMD
#pragma omp simd
for (size_t i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
}
// C++ std::execution (parallel + vectorized)
#include <execution>
std::transform(std::execution::par_unseq,
a, a + n, b, c,
[](float x, float y) { return x + y; });SIMD Wrapper Design
Layered Abstraction
cpp
// simd_wrapper.hpp - Layered SIMD abstraction
namespace hpc::simd {
// === Scalar Fallback ===
struct Scalar {
static constexpr size_t width = 1;
static inline float add(float a, float b) { return a + b; }
static inline float mul(float a, float b) { return a * b; }
// ...
};
// === SSE2 (128-bit) ===
#ifdef __SSE2__
#include <xmmintrin.h>
#include <emmintrin.h>
struct SSE2 {
static constexpr size_t width = 4;
using Vec = __m128;
static inline Vec add(Vec a, Vec b) { return _mm_add_ps(a, b); }
static inline Vec mul(Vec a, Vec b) { return _mm_mul_ps(a, b); }
static inline Vec load(const float* p) { return _mm_loadu_ps(p); }
static inline void store(float* p, Vec v) { _mm_storeu_ps(p, v); }
};
#endif
// === AVX (256-bit) ===
#ifdef __AVX__
#include <immintrin.h>
struct AVX {
static constexpr size_t width = 8;
using Vec = __m256;
static inline Vec add(Vec a, Vec b) { return _mm256_add_ps(a, b); }
static inline Vec mul(Vec a, Vec b) { return _mm256_mul_ps(a, b); }
static inline Vec load(const float* p) { return _mm256_loadu_ps(p); }
static inline void store(float* p, Vec v) { _mm256_storeu_ps(p, v); }
};
#endif
// === AVX-512 (512-bit) ===
#ifdef __AVX512F__
struct AVX512 {
static constexpr size_t width = 16;
using Vec = __m512;
static inline Vec add(Vec a, Vec b) { return _mm512_add_ps(a, b); }
static inline Vec mul(Vec a, Vec b) { return _mm512_mul_ps(a, b); }
static inline Vec load(const float* p) { return _mm512_loadu_ps(p); }
static inline void store(float* p, Vec v) { _mm512_storeu_ps(p, v); }
};
#endif
} // namespace hpc::simdOperator Overloading
cpp
// Natural syntax through operator overloading
namespace hpc::simd {
struct Vec4 {
__m128 data;
Vec4 operator+(Vec4 other) const {
return Vec4{_mm_add_ps(data, other.data)};
}
Vec4 operator*(Vec4 other) const {
return Vec4{_mm_mul_ps(data, other.data)};
}
static Vec4 load(const float* p) {
return Vec4{_mm_loadu_ps(p)};
}
void store(float* p) const {
_mm_storeu_ps(p, data);
}
};
// Usage
void add_arrays(float* a, float* b, float* c, size_t n) {
for (size_t i = 0; i < n; i += 4) {
Vec4 va = Vec4::load(a + i);
Vec4 vb = Vec4::load(b + i);
Vec4 vc = va + vb;
vc.store(c + i);
}
}
} // namespace hpc::simdRuntime Dispatch
Function Multi-Versioning
cpp
// Generate multiple versions at compile time
namespace {
__attribute__((target("default")))
void add_arrays_impl(float* a, float* b, float* c, size_t n) {
for (size_t i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
}
}
__attribute__((target("sse2")))
void add_arrays_impl(float* a, float* b, float* c, size_t n) {
for (size_t i = 0; i < n; i += 4) {
__m128 va = _mm_loadu_ps(a + i);
__m128 vb = _mm_loadu_ps(b + i);
_mm_storeu_ps(c + i, _mm_add_ps(va, vb));
}
}
__attribute__((target("avx2")))
void add_arrays_impl(float* a, float* b, float* c, size_t n) {
for (size_t i = 0; i < n; i += 8) {
__m256 va = _mm256_loadu_ps(a + i);
__m256 vb = _mm256_loadu_ps(b + i);
_mm256_storeu_ps(c + i, _mm256_add_ps(va, vb));
}
}
} // anonymous namespace
// Public interface - compiler selects best version
inline void add_arrays(float* a, float* b, float* c, size_t n) {
add_arrays_impl(a, b, c, n);
}Manual Runtime Dispatch
cpp
#include <cpuid.h>
enum class SIMDCapability {
Scalar,
SSE2,
AVX2,
AVX512
};
SIMDLevel detect_simd() {
unsigned int eax, ebx, ecx, edx;
__get_cpuid(1, &eax, &ebx, &ecx, &edx);
if (ecx & bit_AVX512F) return SIMDCapability::AVX512;
if (ecx & bit_AVX2) return SIMDCapability::AVX2;
if (edx & bit_SSE2) return SIMDCapability::SSE2;
return SIMDCapability::Scalar;
}
// Function pointer table
using AddFunc = void(*)(float*, float*, float*, size_t);
AddFunc select_add_func() {
static const AddFunc func = []() -> AddFunc {
switch (detect_simd()) {
case SIMDCapability::AVX512: return add_avx512;
case SIMDCapability::AVX2: return add_avx2;
case SIMDCapability::SSE2: return add_sse2;
default: return add_scalar;
}
}();
return func;
}Masked Operations (AVX-512)
AVX-512 introduces mask registers for predicated execution:
cpp
// AVX-512 masked operation
void conditional_scale(float* data, size_t n, float threshold) {
const __m512 thresh = _mm512_set1_ps(threshold);
const __m512 scale = _mm512_set1_ps(2.0f);
for (size_t i = 0; i < n; i += 16) {
__m512 v = _mm512_loadu_ps(data + i);
// Compare creates a mask
__mmask16 mask = _mm512_cmp_ps_mask(v, thresh, _MM_CMPINT_GT);
// Only scale elements where mask is 1
__m512 scaled = _mm512_mul_ps(v, scale);
__m512 result = _mm512_mask_blend_ps(mask, v, scaled);
_mm512_storeu_ps(data + i, result);
}
}Mask Benefits
- No branches: Eliminates branch misprediction
- Full vector utilization: All lanes active, just with different operations
- Cleaner code: Natural expression of conditional logic
Performance Analysis
Throughput Comparison
| Operation | Scalar | SSE2 | AVX2 | AVX-512 |
|---|---|---|---|---|
| Add (GFLOPS) | 2.0 | 8.0 | 16.0 | 32.0 |
| Mul (GFLOPS) | 2.0 | 8.0 | 16.0 | 32.0 |
| FMA (GFLOPS) | N/A | N/A | 32.0 | 64.0 |
Latency vs Throughput
cpp
// High latency operation - hide with independent work
__m256 x = _mm256_div_ps(a, b); // ~10-15 cycles
// Do independent work while division completes
__m256 y = _mm256_mul_ps(c, d); // ~5 cycles
__m256 z = _mm256_add_ps(e, f); // ~3 cycles
// Now use x - latency hiddenPractical Guidelines
When to Use Explicit SIMD
✅ Use explicit SIMD when:
- Auto-vectorization fails (check compiler reports)
- Performance is critical and vectorization is key
- You need masked operations (AVX-512)
- Algorithm has specific SIMD-friendly structure
❌ Prefer auto-vectorization when:
- Code simplicity is important
- Compiler can vectorize effectively
- Portability across architectures matters
- Maintenance cost is a concern
Debugging Vectorization
bash
# GCC: Show vectorization reports
g++ -O3 -fopt-info-vec-missed
g++ -O3 -fopt-info-vec-optimized
# Clang: Show optimization remarks
clang++ -O3 -Rpass=loop-vectorize
clang++ -O3 -Rpass-missed=loop-vectorize
# Intel: Detailed vectorization report
icpc -O3 -qopt-report=5 -qopt-report-phase=vec