simd-intrinsics
SIMD Intrinsics
Purpose
Guide agents through SIMD: reading auto-vectorization output, writing SSE2/AVX2/NEON intrinsics, runtime CPU feature detection, and choosing between compiler auto-vectorization and manual intrinsics.
Triggers
- "How do I check if my loop is being auto-vectorized?"
- "How do I write SSE2/AVX2 intrinsics?"
- "Auto-vectorization failed — how do I fix it?"
- "How do I check for CPU features at runtime?"
- "Should I use intrinsics or let the compiler vectorize?"
- "How do I write NEON intrinsics for ARM?"
Workflow
1. Check auto-vectorization
# GCC: show vectorization info
gcc -O2 -march=native -fopt-info-vec src/hot.c -o hot
# Verbose: show missed + successful
gcc -O2 -march=native -fopt-info-vec-missed -fopt-info-vec-optimized src/hot.c
# Clang: vectorization remarks
clang -O2 -march=native \
-Rpass=loop-vectorize \
-Rpass-missed=loop-vectorize \
-Rpass-analysis=loop-vectorize \
src/hot.c -o hot
# Example missed message:
# hot.c:15:5: remark: loop not vectorized: value that could not be identified as
# reduction is used outside the loop [-Rpass-missed=loop-vectorize]
Common auto-vectorization blockers:
| Blocker | Fix |
|---|---|
| Loop-carried dependency | Restructure to remove dependency |
| Data-dependent exit (early return) | Move exit after loop |
| Non-contiguous memory | Use gather/scatter or restructure |
| Aliasing (pointer may alias) | Add __restrict__ or restrict |
| Unknown trip count | Add __builtin_expect or hint |
| Function call in loop body | Inline the function |
// Help the compiler by adding restrict
void add_arrays(float * __restrict__ dst,
const float * __restrict__ a,
const float * __restrict__ b,
size_t n) {
for (size_t i = 0; i < n; i++)
dst[i] = a[i] + b[i]; // Now vectorizable
}
2. Runtime CPU feature detection
// Linux: use __builtin_cpu_supports (GCC/Clang)
if (__builtin_cpu_supports("avx2")) {
process_avx2(data, len);
} else if (__builtin_cpu_supports("sse4.2")) {
process_sse42(data, len);
} else {
process_scalar(data, len);
}
// Check specific features:
__builtin_cpu_supports("sse2")
__builtin_cpu_supports("sse4.1")
__builtin_cpu_supports("sse4.2")
__builtin_cpu_supports("avx")
__builtin_cpu_supports("avx2")
__builtin_cpu_supports("avx512f")
__builtin_cpu_supports("bmi")
__builtin_cpu_supports("bmi2")
__builtin_cpu_supports("fma")
// Portable: use CPUID directly
#include <cpuid.h>
static int has_avx2(void) {
unsigned int eax, ebx, ecx, edx;
// CPUID leaf 7, subleaf 0
__cpuid_count(7, 0, eax, ebx, ecx, edx);
return (ebx >> 5) & 1; // bit 5 = AVX2
}
3. SSE2 / SSE4.2 intrinsics (x86)
#include <immintrin.h> // All x86 intrinsics
// SSE2: 128-bit vectors
// __m128 = 4 floats
// __m128d = 2 doubles
// __m128i = integers (8x16, 4x32, 2x64, 16x8)
void sum_floats_sse2(float *dst, const float *a, const float *b, int n) {
int i = 0;
for (; i <= n - 4; i += 4) {
__m128 va = _mm_loadu_ps(a + i); // unaligned load
__m128 vb = _mm_loadu_ps(b + i);
__m128 vc = _mm_add_ps(va, vb);
_mm_storeu_ps(dst + i, vc); // unaligned store
}
// Handle remainder
for (; i < n; i++) dst[i] = a[i] + b[i];
}
4. AVX2 intrinsics (x86)
#ifdef __AVX2__
#include <immintrin.h>
// __m256 = 8 floats, __m256d = 4 doubles, __m256i = integers
void sum_floats_avx2(float *dst, const float *a, const float *b, int n) {
int i = 0;
for (; i <= n - 8; i += 8) {
__m256 va = _mm256_loadu_ps(a + i);
__m256 vb = _mm256_loadu_ps(b + i);
__m256 vc = _mm256_add_ps(va, vb);
_mm256_storeu_ps(dst + i, vc);
}
// SSE2 tail (4 elements)
for (; i <= n - 4; i += 4) {
__m128 va = _mm_loadu_ps(a + i);
__m128 vb = _mm_loadu_ps(b + i);
_mm_storeu_ps(dst + i, _mm_add_ps(va, vb));
}
// Scalar tail
for (; i < n; i++) dst[i] = a[i] + b[i];
}
// Fused multiply-add (FMA) — 1 instruction for a*b+c
void fma_avx2(float *dst, const float *a, const float *b, const float *c, int n) {
for (int i = 0; i <= n - 8; i += 8) {
__m256 va = _mm256_loadu_ps(a + i);
__m256 vb = _mm256_loadu_ps(b + i);
__m256 vc = _mm256_loadu_ps(c + i);
_mm256_storeu_ps(dst + i, _mm256_fmadd_ps(va, vb, vc)); // dst = a*b + c
}
}
#endif
Compile with: gcc -O2 -mavx2 -mfma src/simd.c
5. NEON intrinsics (ARM/AArch64)
#include <arm_neon.h>
// float32x4_t = 4 floats (128-bit)
// float32x8_t = 8 floats (ARM SVE — scalable)
// uint8x16_t = 16 bytes
// int32x4_t = 4 int32
void sum_floats_neon(float *dst, const float *a, const float *b, int n) {
int i = 0;
for (; i <= n - 4; i += 4) {
float32x4_t va = vld1q_f32(a + i); // load 4 floats
float32x4_t vb = vld1q_f32(b + i);
float32x4_t vc = vaddq_f32(va, vb); // add
vst1q_f32(dst + i, vc); // store 4 floats
}
for (; i < n; i++) dst[i] = a[i] + b[i];
}
// AArch64 FMA
void fma_neon(float *dst, const float *a, const float *b, const float *c, int n) {
for (int i = 0; i <= n - 4; i += 4) {
float32x4_t va = vld1q_f32(a + i);
float32x4_t vb = vld1q_f32(b + i);
float32x4_t vc = vld1q_f32(c + i);
vst1q_f32(dst + i, vfmaq_f32(vc, va, vb)); // vc + va*vb
}
}
Compile with: gcc -O2 -march=armv8-a+simd src/simd.c
6. Choose auto-vectorization vs intrinsics
Can the compiler auto-vectorize?
→ Try first: add __restrict__, remove complex control flow, align data
→ Check with -fopt-info-vec or -Rpass=loop-vectorize
→ If vectorized: verify correctness and performance
Still need intrinsics?
→ Prefer compiler builtins: __builtin_popcount, __builtin_ctz
→ Use SIMD intrinsics for: hand-tuned shuffles, gather/scatter, horizontal ops
→ Avoid intrinsics for: simple element-wise ops (let compiler do it)
7. Alignment and performance
// Aligned allocation (required for _mm256_load_ps, optional for _mm256_loadu_ps)
float *buf = (float *)aligned_alloc(32, n * sizeof(float));
// 32-byte alignment for AVX2, 64 for AVX-512
// Hint alignment to compiler
float *__attribute__((aligned(32))) buf = ...;
// Use aligned loads when data is aligned (faster)
__m256 v = _mm256_load_ps(aligned_ptr); // requires 32-byte alignment
__m256 v = _mm256_loadu_ps(unaligned_ptr); // any alignment, slightly slower on old CPUs
For Intel Intrinsics Guide reference and NEON lookup tables, see references/intel-intrinsics-guide.md.
Related skills
- Use
skills/compilers/gccfor-march,-msse4.2,-mavx2flags - Use
skills/compilers/clangfor vectorization remarks and auto-vectorization control - Use
skills/profilers/linux-perfto measure SIMD impact with perf stat counters - Use
skills/low-level-programming/assembly-x86for reading SIMD assembly output
More from mohitmishra786/low-level-dev-skills
cmake
CMake build system skill for C/C++ projects. Use when writing or refactoring CMakeLists.txt, configuring out-of-source builds, selecting generators (Ninja, Make, VS), managing targets and dependencies with target_link_libraries, integrating external packages via find_package or FetchContent, enabling sanitizers, setting up toolchain files for cross-compilation, or exporting CMake packages. Activates on queries about CMakeLists.txt, cmake configure errors, target properties, install rules, CPack, or CMake presets.
580static-analysis
Static analysis skill for C/C++ codebases. Use when hardening code quality, triaging noisy builds, running clang-tidy, cppcheck, or scan-build, interpreting check categories, suppressing false positives, or integrating static analysis into CI. Activates on queries about clang-tidy checks, cppcheck, scan-build, compile_commands.json, code hardening, or static analysis warnings.
407llvm
LLVM IR and pass pipeline skill. Use when working directly with LLVM Intermediate Representation (IR), running opt passes, generating IR with llc, inspecting or writing LLVM IR for custom passes, or understanding how the LLVM backend lowers IR to assembly. Activates on queries about LLVM IR, opt, llc, llvm-dis, LLVM passes, IR transformations, or building LLVM-based tools.
361gdb
GDB debugger skill for C/C++ programs. Use when starting a GDB session, setting breakpoints, stepping through code, inspecting variables, debugging crashes, using reverse debugging (record/replay), remote debugging with gdbserver, or loading core dumps. Activates on queries about GDB commands, segfaults, hangs, watchpoints, conditional breakpoints, pretty-printers, Python GDB scripting, or multi-threaded debugging.
153linux-perf
Linux perf profiler skill for CPU performance analysis. Use when collecting sampling profiles with perf record, generating perf report, measuring hardware counters (cache misses, branch mispredicts, IPC), identifying hot functions, or feeding perf data into flamegraph tools. Activates on queries about perf, Linux performance counters, PMU events, off-CPU profiling, perf stat, perf annotate, or sampling-based profiling on Linux.
142core-dumps
Core dump analysis skill for production crash triage. Use when loading core files in GDB or LLDB, enabling core dump generation on Linux/macOS, mapping symbols with debuginfo or debuginfod, or extracting backtraces from crashes without re-running the program. Activates on queries about core files, ulimit, coredumpctl, debuginfod, crash triage, or analyzing segfaults from production binaries.
131