Skip to content

SIMD Internals Deep Dive

This deep dive explores the internal mechanisms of SIMD (Single Instruction Multiple Data) vectorization in modern C++ applications.

SIMD Fundamentals

The Vector Advantage

SIMD processes multiple data elements with a single instruction:

Scalar: 4 operations for 4 floats
┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐
│ a+b │ │ c+d │ │ e+f │ │ g+h │
└─────┘ └─────┘ └─────┘ └─────┘

SIMD (SSE): 1 operation for 4 floats
┌─────────────────────┐
│ [a,c,e,g] + [b,d,f,h] │
└─────────────────────┘

SIMD Register Sizes

ISARegister SizeFloatsDoublesInt32
SSE2128-bit424
AVX256-bit848
AVX-512512-bit16816

Auto-Vectorization Conditions

The compiler can automatically vectorize loops when specific conditions are met.

Good Patterns

cpp
// ✅ Simple, contiguous access
void add_arrays(float* a, float* b, float* c, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        c[i] = a[i] + b[i];
    }
}

// ✅ No aliasing (restrict)
void add_arrays_restrict(float* __restrict a, float* __restrict b,
                         float* __restrict c, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        c[i] = a[i] + b[i];
    }
}

// ✅ Known trip count
constexpr size_t N = 1024;
void process_known(float data[N]) {
    for (size_t i = 0; i < N; ++i) {
        data[i] *= 2.0f;
    }
}

Bad Patterns

cpp
// ❌ Data dependency (prefix sum)
void prefix_sum(float* data, size_t n) {
    for (size_t i = 1; i < n; ++i) {
        data[i] += data[i - 1];  // Each iteration depends on previous
    }
}

// ❌ Indirect access (gather)
void gather(float* src, int* indices, float* dst, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        dst[i] = src[indices[i]];  // Non-contiguous access
    }
}

// ❌ Branch inside loop
void conditional(float* data, size_t n, float threshold) {
    for (size_t i = 0; i < n; ++i) {
        if (data[i] > threshold) {  // Branch prevents vectorization
            data[i] *= 2.0f;
        }
    }
}

// ❌ Function call
void with_call(float* data, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        data[i] = std::sqrt(data[i]);  // Unknown function
    }
}

Compiler Hints

cpp
// GCC/Clang: Suggest vectorization
#pragma GCC ivdep
for (size_t i = 0; i < n; ++i) {
    c[i] = a[i] + b[i];
}

// OpenMP SIMD
#pragma omp simd
for (size_t i = 0; i < n; ++i) {
    c[i] = a[i] + b[i];
}

// C++ std::execution (parallel + vectorized)
#include <execution>
std::transform(std::execution::par_unseq,
               a, a + n, b, c,
               [](float x, float y) { return x + y; });

SIMD Wrapper Design

Layered Abstraction

cpp
// simd_wrapper.hpp - Layered SIMD abstraction

namespace hpc::simd {

// === Scalar Fallback ===
struct Scalar {
    static constexpr size_t width = 1;

    static inline float add(float a, float b) { return a + b; }
    static inline float mul(float a, float b) { return a * b; }
    // ...
};

// === SSE2 (128-bit) ===
#ifdef __SSE2__
#include <xmmintrin.h>
#include <emmintrin.h>

struct SSE2 {
    static constexpr size_t width = 4;

    using Vec = __m128;

    static inline Vec add(Vec a, Vec b) { return _mm_add_ps(a, b); }
    static inline Vec mul(Vec a, Vec b) { return _mm_mul_ps(a, b); }
    static inline Vec load(const float* p) { return _mm_loadu_ps(p); }
    static inline void store(float* p, Vec v) { _mm_storeu_ps(p, v); }
};
#endif

// === AVX (256-bit) ===
#ifdef __AVX__
#include <immintrin.h>

struct AVX {
    static constexpr size_t width = 8;

    using Vec = __m256;

    static inline Vec add(Vec a, Vec b) { return _mm256_add_ps(a, b); }
    static inline Vec mul(Vec a, Vec b) { return _mm256_mul_ps(a, b); }
    static inline Vec load(const float* p) { return _mm256_loadu_ps(p); }
    static inline void store(float* p, Vec v) { _mm256_storeu_ps(p, v); }
};
#endif

// === AVX-512 (512-bit) ===
#ifdef __AVX512F__
struct AVX512 {
    static constexpr size_t width = 16;

    using Vec = __m512;

    static inline Vec add(Vec a, Vec b) { return _mm512_add_ps(a, b); }
    static inline Vec mul(Vec a, Vec b) { return _mm512_mul_ps(a, b); }
    static inline Vec load(const float* p) { return _mm512_loadu_ps(p); }
    static inline void store(float* p, Vec v) { _mm512_storeu_ps(p, v); }
};
#endif

} // namespace hpc::simd

Operator Overloading

cpp
// Natural syntax through operator overloading
namespace hpc::simd {

struct Vec4 {
    __m128 data;

    Vec4 operator+(Vec4 other) const {
        return Vec4{_mm_add_ps(data, other.data)};
    }

    Vec4 operator*(Vec4 other) const {
        return Vec4{_mm_mul_ps(data, other.data)};
    }

    static Vec4 load(const float* p) {
        return Vec4{_mm_loadu_ps(p)};
    }

    void store(float* p) const {
        _mm_storeu_ps(p, data);
    }
};

// Usage
void add_arrays(float* a, float* b, float* c, size_t n) {
    for (size_t i = 0; i < n; i += 4) {
        Vec4 va = Vec4::load(a + i);
        Vec4 vb = Vec4::load(b + i);
        Vec4 vc = va + vb;
        vc.store(c + i);
    }
}

} // namespace hpc::simd

Runtime Dispatch

Function Multi-Versioning

cpp
// Generate multiple versions at compile time
namespace {

__attribute__((target("default")))
void add_arrays_impl(float* a, float* b, float* c, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        c[i] = a[i] + b[i];
    }
}

__attribute__((target("sse2")))
void add_arrays_impl(float* a, float* b, float* c, size_t n) {
    for (size_t i = 0; i < n; i += 4) {
        __m128 va = _mm_loadu_ps(a + i);
        __m128 vb = _mm_loadu_ps(b + i);
        _mm_storeu_ps(c + i, _mm_add_ps(va, vb));
    }
}

__attribute__((target("avx2")))
void add_arrays_impl(float* a, float* b, float* c, size_t n) {
    for (size_t i = 0; i < n; i += 8) {
        __m256 va = _mm256_loadu_ps(a + i);
        __m256 vb = _mm256_loadu_ps(b + i);
        _mm256_storeu_ps(c + i, _mm256_add_ps(va, vb));
    }
}

} // anonymous namespace

// Public interface - compiler selects best version
inline void add_arrays(float* a, float* b, float* c, size_t n) {
    add_arrays_impl(a, b, c, n);
}

Manual Runtime Dispatch

cpp
#include <cpuid.h>

enum class SIMDCapability {
    Scalar,
    SSE2,
    AVX2,
    AVX512
};

SIMDLevel detect_simd() {
    unsigned int eax, ebx, ecx, edx;

    __get_cpuid(1, &eax, &ebx, &ecx, &edx);

    if (ecx & bit_AVX512F) return SIMDCapability::AVX512;
    if (ecx & bit_AVX2) return SIMDCapability::AVX2;
    if (edx & bit_SSE2) return SIMDCapability::SSE2;

    return SIMDCapability::Scalar;
}

// Function pointer table
using AddFunc = void(*)(float*, float*, float*, size_t);

AddFunc select_add_func() {
    static const AddFunc func = []() -> AddFunc {
        switch (detect_simd()) {
            case SIMDCapability::AVX512: return add_avx512;
            case SIMDCapability::AVX2: return add_avx2;
            case SIMDCapability::SSE2: return add_sse2;
            default: return add_scalar;
        }
    }();
    return func;
}

Masked Operations (AVX-512)

AVX-512 introduces mask registers for predicated execution:

cpp
// AVX-512 masked operation
void conditional_scale(float* data, size_t n, float threshold) {
    const __m512 thresh = _mm512_set1_ps(threshold);
    const __m512 scale = _mm512_set1_ps(2.0f);

    for (size_t i = 0; i < n; i += 16) {
        __m512 v = _mm512_loadu_ps(data + i);

        // Compare creates a mask
        __mmask16 mask = _mm512_cmp_ps_mask(v, thresh, _MM_CMPINT_GT);

        // Only scale elements where mask is 1
        __m512 scaled = _mm512_mul_ps(v, scale);
        __m512 result = _mm512_mask_blend_ps(mask, v, scaled);

        _mm512_storeu_ps(data + i, result);
    }
}

Mask Benefits

  1. No branches: Eliminates branch misprediction
  2. Full vector utilization: All lanes active, just with different operations
  3. Cleaner code: Natural expression of conditional logic

Performance Analysis

Throughput Comparison

OperationScalarSSE2AVX2AVX-512
Add (GFLOPS)2.08.016.032.0
Mul (GFLOPS)2.08.016.032.0
FMA (GFLOPS)N/AN/A32.064.0

Latency vs Throughput

cpp
// High latency operation - hide with independent work
__m256 x = _mm256_div_ps(a, b);  // ~10-15 cycles

// Do independent work while division completes
__m256 y = _mm256_mul_ps(c, d);  // ~5 cycles
__m256 z = _mm256_add_ps(e, f);  // ~3 cycles

// Now use x - latency hidden

Practical Guidelines

When to Use Explicit SIMD

Use explicit SIMD when:

  • Auto-vectorization fails (check compiler reports)
  • Performance is critical and vectorization is key
  • You need masked operations (AVX-512)
  • Algorithm has specific SIMD-friendly structure

Prefer auto-vectorization when:

  • Code simplicity is important
  • Compiler can vectorize effectively
  • Portability across architectures matters
  • Maintenance cost is a concern

Debugging Vectorization

bash
# GCC: Show vectorization reports
g++ -O3 -fopt-info-vec-missed
g++ -O3 -fopt-info-vec-optimized

# Clang: Show optimization remarks
clang++ -O3 -Rpass=loop-vectorize
clang++ -O3 -Rpass-missed=loop-vectorize

# Intel: Detailed vectorization report
icpc -O3 -qopt-report=5 -qopt-report-phase=vec

References

References

  1. Fog, A. (2023). Optimizing software in C++ - Chapter 12: Vectorization. Copenhagen UniversityLink
  2. Intel (2023). Intel Intrinsics Guide. Intel CorporationLink
  3. GCC Team (2023). GCC Auto-vectorization documentation. GCC ManualLink

Released under the MIT License.