Table of Contents

  1. Finding OpenMP
  2. Version Detection
  3. Per-Target OpenMP
  4. SIMD Directives
  5. GPU Offloading
  6. macOS Workarounds (libomp)
  7. Conditional OpenMP Support
  8. Performance Tips
Back to CMake Mastery Series

OpenMP

June 4, 2026 Wasil Zafar 8 min read

The complete guide to integrating OpenMP with CMake — finding the runtime, per-target parallelism, version detection, SIMD directives, GPU offloading, and platform-specific workarounds.

Parallel

Finding OpenMP

OpenMP is a portable shared-memory parallel programming API supported by GCC, Clang, MSVC, and Intel compilers. CMake provides a first-class FindOpenMP module that detects compiler support and creates imported targets:

# CMakeLists.txt — Basic OpenMP integration
cmake_minimum_required(VERSION 3.20)
project(ParallelApp LANGUAGES CXX)

# Request C++17 and make it a hard requirement. Without
# CMAKE_CXX_STANDARD_REQUIRED, CMake may silently fall back to an older
# standard when the compiler cannot provide the requested one.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Fails the configure step if the C++ compiler has no usable OpenMP support.
find_package(OpenMP REQUIRED)

add_executable(parallel_app src/main.cpp src/solver.cpp)
# The imported target carries both the compile flags and the runtime library.
target_link_libraries(parallel_app PRIVATE OpenMP::OpenMP_CXX)

# Report what FindOpenMP detected
message(STATUS "OpenMP version: ${OpenMP_CXX_VERSION}")
message(STATUS "OpenMP flags: ${OpenMP_CXX_FLAGS}")

The OpenMP::OpenMP_CXX target automatically propagates the correct compiler flags (-fopenmp for GCC/Clang, /openmp for MSVC) and links the appropriate runtime library. For C and Fortran, use OpenMP::OpenMP_C and OpenMP::OpenMP_Fortran respectively.

Key Insight: Always use the imported target OpenMP::OpenMP_CXX rather than manually adding ${OpenMP_CXX_FLAGS} to compile options. The target approach correctly separates compile flags from link flags and propagates through transitive dependencies.

Version Detection

Each OpenMP version adds new features. CMake exposes the detected version as OpenMP_CXX_VERSION, so you can enable features conditionally:

# Version-dependent feature flags
find_package(OpenMP REQUIRED)

add_executable(solver src/solver.cpp)
target_link_libraries(solver PRIVATE OpenMP::OpenMP_CXX)

# Translate the detected OpenMP version into HAS_OMP_* compile definitions
# that the sources can test with #ifdef. The variable is expanded inside
# quotes so an empty or undefined version compares as "older than everything"
# instead of being treated as a literal variable name.

# The simd construct was introduced in OpenMP 4.0.
if("${OpenMP_CXX_VERSION}" VERSION_GREATER_EQUAL "4.0")
    target_compile_definitions(solver PRIVATE HAS_OMP_SIMD=1)
    message(STATUS "OpenMP 4.0+ detected — SIMD directives available")
endif()

if("${OpenMP_CXX_VERSION}" VERSION_GREATER_EQUAL "5.0")
    target_compile_definitions(solver PRIVATE HAS_OMP_LOOP=1)
    message(STATUS "OpenMP 5.0+ detected — loop construct available")
endif()

if("${OpenMP_CXX_VERSION}" VERSION_GREATER_EQUAL "5.1")
    target_compile_definitions(solver PRIVATE HAS_OMP_DISPATCH=1)
    message(STATUS "OpenMP 5.1+ detected — dispatch construct available")
endif()
// src/solver.cpp — Version-conditional OpenMP usage
#include <omp.h>
#include <vector>
#include <numeric>
#include <iostream>

/// Returns the sum of every element in `data`, computed with an OpenMP
/// reduction when OpenMP is enabled (the pragmas are ignored otherwise).
///
/// When HAS_OMP_LOOP is defined the OpenMP 5.0 `loop` construct is used;
/// otherwise the classic `parallel for` form is used.
/// An empty vector yields 0.0.
double parallel_sum(const std::vector<double>& data) {
    // Iterate with a signed index: OpenMP 2.0 implementations (for example
    // MSVC's default /openmp mode) reject unsigned loop variables, so a
    // size_t index would make the fallback branch non-portable.
    const double* values = data.data();
    const long long count = static_cast<long long>(data.size());
    double sum = 0.0;

#ifdef HAS_OMP_LOOP
    // OpenMP 5.0 loop construct (better optimization hints)
    #pragma omp parallel loop reduction(+:sum)
    for (long long i = 0; i < count; ++i) {
        sum += values[i];
    }
#else
    // Classic parallel for (accepted by every OpenMP implementation)
    #pragma omp parallel for reduction(+:sum)
    for (long long i = 0; i < count; ++i) {
        sum += values[i];
    }
#endif

    return sum;
}

// Demo driver: fills a vector with 1, 2, ..., 1000000, then reports the
// OpenMP thread limit and the sum computed by parallel_sum().
int main() {
    constexpr std::size_t kElementCount = 1000000;

    std::vector<double> samples(kElementCount);
    std::iota(samples.begin(), samples.end(), 1.0);

    const int max_threads = omp_get_max_threads();
    std::cout << "OpenMP threads: " << max_threads << "\n";

    const double total = parallel_sum(samples);
    std::cout << "Sum: " << total << "\n";

    return 0;
}

Per-Target OpenMP

In projects with mixed serial and parallel code, apply OpenMP only to specific targets rather than globally:

# Per-target OpenMP — not all code needs parallelism
cmake_minimum_required(VERSION 3.20)
project(HybridApp LANGUAGES CXX)

find_package(OpenMP REQUIRED)

# Serial library — no OpenMP overhead
add_library(config_lib src/config.cpp src/parser.cpp)
target_include_directories(config_lib PUBLIC include)

# Parallel computation library — uses OpenMP
add_library(compute_lib src/solver.cpp src/matrix.cpp)
target_include_directories(compute_lib PUBLIC include)
# PUBLIC so that consumers of compute_lib also receive the OpenMP flags.
target_link_libraries(compute_lib PUBLIC OpenMP::OpenMP_CXX)

# Main application — gets OpenMP transitively from compute_lib
add_executable(app src/main.cpp)
target_link_libraries(app PRIVATE config_lib compute_lib)

# Tests — may want OpenMP disabled for determinism
add_executable(tests tests/test_solver.cpp)
target_link_libraries(tests PRIVATE compute_lib)
# Application-level macro only: it has no effect on the OpenMP runtime unless
# the test sources read it (e.g. to call omp_set_num_threads()).
target_compile_definitions(tests PRIVATE OMP_NUM_THREADS_DEFAULT=1)

# Register the test with CTest and pin the runtime to a single thread through
# the OMP_NUM_THREADS environment variable, which the OpenMP runtime honours.
enable_testing()
add_test(NAME solver_tests COMMAND tests)
set_tests_properties(solver_tests PROPERTIES
    ENVIRONMENT "OMP_NUM_THREADS=1"
)

SIMD Directives

OpenMP 4.0+ provides SIMD directives for explicit vectorization. CMake integration requires ensuring the compiler supports the requested SIMD level:

# SIMD-aware build configuration
find_package(OpenMP REQUIRED)

add_library(simd_math src/vectorized_math.cpp)
target_link_libraries(simd_math PUBLIC OpenMP::OpenMP_CXX)

# Enable SIMD-friendly compiler optimizations.
# Each generator expression is quoted and uses ';' between flags: an unquoted
# expression containing spaces is split into separate command arguments and
# may no longer be recognised as a generator expression.
# Note: -march=native ties the binary to the build machine's CPU.
target_compile_options(simd_math PRIVATE
    "$<$<CXX_COMPILER_ID:GNU>:-march=native;-ftree-vectorize>"
    "$<$<CXX_COMPILER_ID:Clang>:-march=native;-fvectorize>"
    "$<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>"
)
// src/vectorized_math.cpp — OpenMP SIMD usage
#include <cmath>
#include <vector>

// Explicit SIMD vectorization
//
// Scales the first `n` elements of `data` in place so that the array has unit
// Euclidean (L2) norm.
//
// If the norm is exactly zero (all elements are zero, or n <= 0) the array is
// left untouched instead of being divided by zero and filled with NaN.
void normalize_array(float* data, int n) {
    float norm = 0.0f;

    // SIMD reduction: accumulate the sum of squares
    #pragma omp simd reduction(+:norm)
    for (int i = 0; i < n; ++i) {
        norm += data[i] * data[i];
    }

    norm = std::sqrt(norm);

    // A zero vector has no direction to normalize towards.
    if (norm == 0.0f) {
        return;
    }

    // SIMD division
    #pragma omp simd
    for (int i = 0; i < n; ++i) {
        data[i] /= norm;
    }
}

// Thread-level parallelism over output cells, SIMD over the dot product.
//
// Computes C = A * B for row-major matrices: A is M x K, B is K x N and the
// result C is M x N. Every element of C is overwritten.
void matrix_multiply(const float* A, const float* B, float* C,
                     int M, int N, int K) {
    // collapse(2) needs the two outer loops to stay perfectly nested, so all
    // per-cell setup lives inside the inner (column) loop.
    #pragma omp parallel for collapse(2)
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col) {
            const float* lhs_row = A + row * K;   // start of A's current row
            const float* rhs_col = B + col;       // top of B's current column
            float dot = 0.0f;

            #pragma omp simd reduction(+:dot)
            for (int k = 0; k < K; ++k) {
                dot += lhs_row[k] * rhs_col[k * N];
            }

            C[row * N + col] = dot;
        }
    }
}

GPU Offloading

OpenMP 4.0 introduced target directives for offloading computation to GPUs, and OpenMP 4.5 extended them considerably. Offloading requires specific compiler and runtime support:

# GPU offloading with OpenMP (requires compatible compiler)
cmake_minimum_required(VERSION 3.20)
project(GPUApp LANGUAGES CXX)

find_package(OpenMP REQUIRED)

# User-overridable settings; the defaults reproduce the previously hard-coded
# values. Override with -DGPUAPP_CUDA_PATH=... / -DGPUAPP_AMDGPU_ARCH=...
set(GPUAPP_CUDA_PATH "/usr/local/cuda" CACHE PATH
    "CUDA toolkit root passed to Clang via --cuda-path")
set(GPUAPP_AMDGPU_ARCH "gfx906" CACHE STRING
    "AMD GPU architecture passed to GCC's amdgcn offload compiler")

add_executable(gpu_app src/gpu_compute.cpp)
target_link_libraries(gpu_app PRIVATE OpenMP::OpenMP_CXX)

if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
    # Clang with NVIDIA GPU offloading.
    # STREQUAL (not MATCHES) keeps AppleClang and ARMClang out of this branch.
    target_compile_options(gpu_app PRIVATE
        -fopenmp-targets=nvptx64-nvidia-cuda
        "--cuda-path=${GPUAPP_CUDA_PATH}"
    )
    target_link_options(gpu_app PRIVATE
        -fopenmp-targets=nvptx64-nvidia-cuda
    )
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
    # GCC with AMD GPU offloading.
    # GCC generates the device code during the link step, so the offload
    # flags are passed to the linker invocation as well as the compiler.
    set(gpuapp_gcc_offload_flags
        -foffload=amdgcn-amdhsa
        "-foffload-options=amdgcn-amdhsa=-march=${GPUAPP_AMDGPU_ARCH}"
    )
    target_compile_options(gpu_app PRIVATE ${gpuapp_gcc_offload_flags})
    target_link_options(gpu_app PRIVATE ${gpuapp_gcc_offload_flags})
endif()
Pitfall: GPU offloading support varies dramatically between compilers. GCC supports NVIDIA (via nvptx) and AMD (via GCN), Clang supports NVIDIA and AMD, but MSVC has no GPU offloading support. Always check your target compiler's OpenMP offloading capabilities.

macOS Workarounds (libomp)

Apple's Clang (shipped with Xcode) does not include an OpenMP runtime. macOS users must install libomp separately and tell CMake where to find it:

# Install the OpenMP runtime library (libomp) on macOS using Homebrew
brew install libomp

# Homebrew places the library under /opt/homebrew/opt/libomp (Apple Silicon)
# or /usr/local/opt/libomp (Intel Mac)
# macOS OpenMP workaround
cmake_minimum_required(VERSION 3.20)
project(MyApp LANGUAGES CXX)

# Only Apple's bundled Clang needs these hints. Other compilers on macOS
# (e.g. Homebrew GCC or upstream LLVM Clang) would be broken by
# "-Xpreprocessor -fopenmp".
if(APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
    # Ask Homebrew where libomp lives, unless the user already passed
    # -DLIBOMP_PREFIX=... on the command line.
    if(NOT LIBOMP_PREFIX)
        find_program(BREW_EXECUTABLE brew)
        if(BREW_EXECUTABLE)
            execute_process(
                COMMAND "${BREW_EXECUTABLE}" --prefix libomp
                OUTPUT_VARIABLE LIBOMP_PREFIX
                OUTPUT_STRIP_TRAILING_WHITESPACE
                ERROR_QUIET
            )
        endif()
    endif()

    # Use the prefix only if the library is really there, so a missing
    # libomp does not leave FindOpenMP with bogus hints.
    if(LIBOMP_PREFIX AND EXISTS "${LIBOMP_PREFIX}/lib/libomp.dylib")
        # Pre-seed the variables FindOpenMP would otherwise try to detect.
        set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I${LIBOMP_PREFIX}/include")
        set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I${LIBOMP_PREFIX}/include")
        set(OpenMP_C_LIB_NAMES "omp")
        set(OpenMP_CXX_LIB_NAMES "omp")
        set(OpenMP_omp_LIBRARY "${LIBOMP_PREFIX}/lib/libomp.dylib")
    else()
        message(STATUS "Homebrew libomp not found — relying on FindOpenMP defaults")
    endif()
endif()

find_package(OpenMP REQUIRED)

add_executable(app src/main.cpp)
target_link_libraries(app PRIVATE OpenMP::OpenMP_CXX)

Conditional OpenMP Support

Making OpenMP optional allows your project to build on systems without OpenMP (single-threaded fallback):

# Optional OpenMP — graceful fallback
cmake_minimum_required(VERSION 3.20)
project(FlexibleApp LANGUAGES CXX)

# Switch for turning OpenMP off even on systems where it is available.
option(USE_OPENMP "Enable OpenMP parallelism" ON)

if(USE_OPENMP)
    # Not REQUIRED: a missing OpenMP only downgrades the build.
    find_package(OpenMP)
    if(OpenMP_CXX_FOUND)
        message(STATUS "OpenMP ${OpenMP_CXX_VERSION} found — parallel builds enabled")
    else()
        message(WARNING "OpenMP not found — building single-threaded")
    endif()
endif()

add_library(compute src/compute.cpp)
target_include_directories(compute PUBLIC include)

# Check the option together with the detection result: OpenMP_CXX_FOUND may
# already be set by an enclosing project, and USE_OPENMP=OFF must still win.
# The USE_OPENMP macro is always defined (1 or 0) so sources can use #if.
if(USE_OPENMP AND OpenMP_CXX_FOUND)
    target_link_libraries(compute PUBLIC OpenMP::OpenMP_CXX)
    target_compile_definitions(compute PUBLIC USE_OPENMP=1)
else()
    target_compile_definitions(compute PUBLIC USE_OPENMP=0)
endif()

add_executable(app src/main.cpp)
target_link_libraries(app PRIVATE compute)
// src/compute.cpp — Conditional OpenMP in source
#include "compute.h"

#if USE_OPENMP
#include <omp.h>
#endif

// Adds up the first `n` values of `data` and returns the total.
// The loop runs as an OpenMP reduction when USE_OPENMP is non-zero and as a
// plain serial loop otherwise.
double parallel_reduce(const double* data, int n) {
    double total = 0.0;

#if USE_OPENMP
    #pragma omp parallel for reduction(+:total)
#endif
    for (int idx = 0; idx < n; ++idx) {
        total += data[idx];
    }

    return total;
}

// Reports how many threads a parallel region may use: the OpenMP runtime's
// maximum when USE_OPENMP is non-zero, otherwise 1.
int get_thread_count() {
    int thread_count = 1;
#if USE_OPENMP
    thread_count = omp_get_max_threads();
#endif
    return thread_count;
}

Performance Tips

Correct CMake configuration directly impacts OpenMP performance. These patterns help avoid common performance pitfalls:

# Performance-optimized OpenMP build
find_package(OpenMP REQUIRED)

add_executable(benchmark src/benchmark.cpp)
target_link_libraries(benchmark PRIVATE OpenMP::OpenMP_CXX)

# Optimization level matters for OpenMP.
# Each generator expression is one quoted argument with ';' between flags;
# unquoted expressions containing spaces or newlines are split into separate
# arguments and may no longer be recognised as generator expressions.
target_compile_options(benchmark PRIVATE
    "$<$<AND:$<CONFIG:Release>,$<CXX_COMPILER_ID:GNU>>:-O3;-march=native;-funroll-loops>"
    "$<$<AND:$<CONFIG:Release>,$<CXX_COMPILER_ID:Clang>>:-O3;-march=native>"
    "$<$<AND:$<CONFIG:Release>,$<CXX_COMPILER_ID:MSVC>>:/O2;/GL>"
)

# MSVC's /GL (whole-program optimization) needs /LTCG at link time.
target_link_options(benchmark PRIVATE
    "$<$<AND:$<CONFIG:Release>,$<CXX_COMPILER_ID:MSVC>>:/LTCG>"
)

# set_tests_properties() fails if the test does not exist, so register
# perf_test here unless it has already been added.
enable_testing()
if(NOT TEST perf_test)
    add_test(NAME perf_test COMMAND benchmark)
endif()

# Set default thread count for CTest
set_tests_properties(perf_test PROPERTIES
    ENVIRONMENT "OMP_NUM_THREADS=4;OMP_PROC_BIND=close;OMP_PLACES=cores"
)
Performance Tip: Set OMP_PROC_BIND=close and OMP_PLACES=cores in your test environment to pin threads to physical cores. Pinning stops the operating system from migrating threads between cores, which reduces NUMA and cache penalties as well as context-switch overhead, often improving performance by 20–40% on multi-socket systems.