Finding OpenMP
OpenMP is a portable shared-memory parallel programming API supported by GCC, Clang, MSVC, and Intel compilers. CMake provides a first-class FindOpenMP module that detects compiler support and creates imported targets:
# CMakeLists.txt — Basic OpenMP integration
cmake_minimum_required(VERSION 3.20)
project(ParallelApp LANGUAGES CXX)

# Request C++17 and make it a hard requirement. Without
# CMAKE_CXX_STANDARD_REQUIRED, CMake quietly falls back to an older standard
# when the compiler cannot provide the requested one.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Detect OpenMP support; configuration stops here if it is unavailable.
find_package(OpenMP REQUIRED)

add_executable(parallel_app
  src/main.cpp
  src/solver.cpp
)

# The imported target carries both the compile flags and the runtime library.
target_link_libraries(parallel_app PRIVATE OpenMP::OpenMP_CXX)

# Print detected version
message(STATUS "OpenMP version: ${OpenMP_CXX_VERSION}")
message(STATUS "OpenMP flags: ${OpenMP_CXX_FLAGS}")
The OpenMP::OpenMP_CXX target automatically propagates the correct compiler flags (-fopenmp for GCC/Clang, /openmp for MSVC) and links the appropriate runtime library. For C and Fortran, use OpenMP::OpenMP_C and OpenMP::OpenMP_Fortran respectively.
Best practice: link against the OpenMP::OpenMP_CXX imported target rather than manually adding ${OpenMP_CXX_FLAGS} to compile options. The target approach correctly separates compile flags from link flags and propagates through transitive dependencies.
Version Detection
Different OpenMP versions provide different features. CMake exposes the detected version allowing conditional feature use:
# Version-dependent feature flags
find_package(OpenMP REQUIRED)

add_executable(solver src/solver.cpp)
target_link_libraries(solver PRIVATE OpenMP::OpenMP_CXX)

# Check OpenMP version for feature availability.
# The variable is expanded inside quotes so that an unset/empty version is
# compared as an empty string (no feature macros) instead of being treated as
# the literal text "OpenMP_CXX_VERSION".

# The simd construct was introduced in OpenMP 4.0.
if("${OpenMP_CXX_VERSION}" VERSION_GREATER_EQUAL "4.0")
  target_compile_definitions(solver PRIVATE HAS_OMP_SIMD=1)
  message(STATUS "OpenMP 4.0+ detected — SIMD directives available")
endif()

# The loop construct was introduced in OpenMP 5.0.
if("${OpenMP_CXX_VERSION}" VERSION_GREATER_EQUAL "5.0")
  target_compile_definitions(solver PRIVATE HAS_OMP_LOOP=1)
  message(STATUS "OpenMP 5.0+ detected — loop construct available")
endif()

# The dispatch construct was introduced in OpenMP 5.1.
if("${OpenMP_CXX_VERSION}" VERSION_GREATER_EQUAL "5.1")
  target_compile_definitions(solver PRIVATE HAS_OMP_DISPATCH=1)
  message(STATUS "OpenMP 5.1+ detected — dispatch construct available")
endif()
// src/solver.cpp — Version-conditional OpenMP usage
#include <omp.h>
#include <vector>
#include <numeric>
#include <iostream>
// Sums every element of `data` using an OpenMP reduction.
// The loop is written once; only the directive placed in front of it depends
// on whether the build defined HAS_OMP_LOOP.
double parallel_sum(const std::vector<double>& data) {
    double sum = 0.0;
    const size_t count = data.size();
#ifdef HAS_OMP_LOOP
    // OpenMP 5.0 loop construct (better optimization hints)
    #pragma omp parallel loop reduction(+:sum)
#else
    // OpenMP 3.0 parallel for (universal support)
    #pragma omp parallel for reduction(+:sum)
#endif
    for (size_t idx = 0; idx < count; ++idx) {
        sum += data[idx];
    }
    return sum;
}
// Demo driver: fills a vector with 1.0 .. 1000000.0, then reports the maximum
// OpenMP thread count followed by the sum computed by parallel_sum().
int main() {
    std::vector<double> data(1000000);
    std::iota(data.begin(), data.end(), 1.0);

    const int max_threads = omp_get_max_threads();
    std::cout << "OpenMP threads: " << max_threads << "\n";

    const double total = parallel_sum(data);
    std::cout << "Sum: " << total << "\n";

    return 0;
}
Per-Target OpenMP
In projects with mixed serial and parallel code, apply OpenMP only to specific targets rather than globally:
# Per-target OpenMP — not all code needs parallelism
cmake_minimum_required(VERSION 3.20)
project(HybridApp LANGUAGES CXX)

find_package(OpenMP REQUIRED)

# Needed so that the add_test() call below is picked up by CTest.
enable_testing()

# Serial library — no OpenMP overhead
add_library(config_lib
  src/config.cpp
  src/parser.cpp
)
target_include_directories(config_lib PUBLIC include)

# Parallel computation library — uses OpenMP
add_library(compute_lib
  src/solver.cpp
  src/matrix.cpp
)
target_include_directories(compute_lib PUBLIC include)
# PUBLIC: every consumer of compute_lib also compiles and links with OpenMP.
target_link_libraries(compute_lib PUBLIC OpenMP::OpenMP_CXX)

# Main application — gets OpenMP transitively from compute_lib
add_executable(app src/main.cpp)
target_link_libraries(app PRIVATE config_lib compute_lib)

# Tests — still built with OpenMP (inherited from compute_lib), but run with a
# single thread so results are reproducible.
add_executable(tests tests/test_solver.cpp)
target_link_libraries(tests PRIVATE compute_lib)

# Compile-time macro only: it does not configure the OpenMP runtime by itself.
# It is kept for test sources that read it explicitly.
target_compile_definitions(tests PRIVATE OMP_NUM_THREADS_DEFAULT=1)

# Actually limit the runtime to one thread when the test is run through CTest.
add_test(NAME solver_tests COMMAND tests)
set_tests_properties(solver_tests PROPERTIES
  ENVIRONMENT "OMP_NUM_THREADS=1"
)
SIMD Directives
OpenMP 4.0+ provides SIMD directives for explicit vectorization. CMake integration requires ensuring the compiler supports the requested SIMD level:
# SIMD-aware build configuration
find_package(OpenMP REQUIRED)

add_library(simd_math src/vectorized_math.cpp)
target_link_libraries(simd_math PUBLIC OpenMP::OpenMP_CXX)

# Enable SIMD-friendly compiler optimizations.
# Each generator expression is quoted and its flags are separated with ';'.
# An unquoted expression containing spaces is split into several command
# arguments and may no longer be recognised as a generator expression.
# NOTE: -march=native ties the binary to the build machine's CPU.
target_compile_options(simd_math PRIVATE
  "$<$<CXX_COMPILER_ID:GNU>:-march=native;-ftree-vectorize>"
  "$<$<CXX_COMPILER_ID:Clang>:-march=native;-fvectorize>"
  "$<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>"
)
// src/vectorized_math.cpp — OpenMP SIMD usage
#include <cmath>
#include <vector>
// Explicit SIMD vectorization.
//
// Scales the first `n` elements of `data` in place so that their Euclidean
// (L2) norm becomes 1.
//
//   data  array of at least `n` floats, modified in place
//   n     number of elements to process; nothing is touched when n <= 0
//
// If the norm is exactly zero (all elements are zero, or n <= 0) the array is
// left unchanged; dividing by it would turn every element into NaN.
void normalize_array(float* data, int n) {
    float norm = 0.0f;

    // SIMD reduction: accumulate the sum of squares.
    #pragma omp simd reduction(+:norm)
    for (int i = 0; i < n; ++i) {
        norm += data[i] * data[i];
    }
    norm = std::sqrt(norm);

    // A zero vector has no direction to normalise; avoid 0/0.
    if (norm == 0.0f) {
        return;
    }

    // SIMD division: scale every element by the norm.
    #pragma omp simd
    for (int i = 0; i < n; ++i) {
        data[i] /= norm;
    }
}
// Combined parallel + SIMD matrix product: C = A * B.
//
// All matrices are stored row-major in flat arrays:
//   A is M x K, B is K x N, C is M x N (every element of C is overwritten).
// The (row, col) iteration space is distributed across threads, and each dot
// product is computed with a SIMD reduction.
void matrix_multiply(const float* A, const float* B, float* C,
                     int M, int N, int K) {
    // collapse(2) needs the two outer loops perfectly nested, so all per-cell
    // setup lives inside the innermost loop body.
    #pragma omp parallel for collapse(2)
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col) {
            const float* a_row = A + row * K;
            float acc = 0.0f;
            #pragma omp simd reduction(+:acc)
            for (int inner = 0; inner < K; ++inner) {
                acc += a_row[inner] * B[inner * N + col];
            }
            C[row * N + col] = acc;
        }
    }
}
GPU Offloading
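OpenMP 4.0 introduced target directives for offloading computation to GPUs and other accelerators, and OpenMP 4.5 made them considerably more practical. Offloading requires a compiler built with offload support and a matching device runtime: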
# GPU offloading with OpenMP (requires compatible compiler)
cmake_minimum_required(VERSION 3.20)
project(GPUApp LANGUAGES CXX)

find_package(OpenMP REQUIRED)

# User-tunable settings. The defaults reproduce the values that used to be
# hard-coded; override them with -D<name>=<value> on the cmake command line.
set(GPU_APP_CUDA_PATH "/usr/local/cuda" CACHE PATH
  "CUDA toolkit root passed to Clang via --cuda-path")
set(GPU_APP_AMDGCN_ARCH "gfx906" CACHE STRING
  "AMD GPU architecture passed to GCC's offload compiler")

add_executable(gpu_app src/gpu_compute.cpp)
target_link_libraries(gpu_app PRIVATE OpenMP::OpenMP_CXX)

if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
  # Clang with NVIDIA GPU offloading.
  # STREQUAL (not MATCHES) keeps AppleClang out of this branch; it reports a
  # different compiler ID and does not provide OpenMP offloading.
  target_compile_options(gpu_app PRIVATE
    -fopenmp-targets=nvptx64-nvidia-cuda
    "--cuda-path=${GPU_APP_CUDA_PATH}"
  )
  target_link_options(gpu_app PRIVATE
    -fopenmp-targets=nvptx64-nvidia-cuda
  )
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
  # GCC with AMD GPU offloading.
  # GCC generates the device code during the link step, so the same offload
  # flags must be given when compiling and when linking.
  set(gpu_app_gcc_offload_flags
    -foffload=amdgcn-amdhsa
    "-foffload-options=amdgcn-amdhsa=-march=${GPU_APP_AMDGCN_ARCH}"
  )
  target_compile_options(gpu_app PRIVATE ${gpu_app_gcc_offload_flags})
  target_link_options(gpu_app PRIVATE ${gpu_app_gcc_offload_flags})
endif()
macOS Workarounds (libomp)
Apple's Clang (Xcode) does not ship with OpenMP support. macOS users must install libomp separately and configure CMake to find it:
# Install the OpenMP runtime (libomp) on macOS with Homebrew
brew install libomp
# The install location depends on the Mac's architecture:
#   /opt/homebrew/opt/libomp   (Apple Silicon)
#   /usr/local/opt/libomp      (Intel Mac)
# macOS OpenMP workaround
cmake_minimum_required(VERSION 3.20)
project(MyApp LANGUAGES CXX)

# Only Apple's own Clang needs these hints. Other compilers on macOS must not
# receive the AppleClang-specific "-Xpreprocessor -fopenmp" flags.
if(APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
  # Help CMake find Homebrew's libomp
  set(libomp_prefix_status 1)
  find_program(BREW_EXECUTABLE brew)
  if(BREW_EXECUTABLE)
    execute_process(
      COMMAND "${BREW_EXECUTABLE}" --prefix libomp
      OUTPUT_VARIABLE LIBOMP_PREFIX
      RESULT_VARIABLE libomp_prefix_status
      OUTPUT_STRIP_TRAILING_WHITESPACE
      ERROR_QUIET
    )
  endif()

  # Pre-seed FindOpenMP only when brew succeeded and the library really
  # exists at the reported prefix; otherwise let FindOpenMP search on its own
  # and report the failure.
  if(libomp_prefix_status EQUAL 0
     AND EXISTS "${LIBOMP_PREFIX}/lib/libomp.dylib")
    set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I${LIBOMP_PREFIX}/include")
    set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I${LIBOMP_PREFIX}/include")
    set(OpenMP_C_LIB_NAMES "omp")
    set(OpenMP_CXX_LIB_NAMES "omp")
    set(OpenMP_omp_LIBRARY "${LIBOMP_PREFIX}/lib/libomp.dylib")
  endif()
endif()

find_package(OpenMP REQUIRED)

add_executable(app src/main.cpp)
target_link_libraries(app PRIVATE OpenMP::OpenMP_CXX)
Conditional OpenMP Support
Making OpenMP optional allows your project to build on systems without OpenMP (single-threaded fallback):
# Optional OpenMP — graceful fallback
cmake_minimum_required(VERSION 3.20)
project(FlexibleApp LANGUAGES CXX)

# Set -DUSE_OPENMP=OFF to force a single-threaded build.
option(USE_OPENMP "Enable OpenMP parallelism" ON)

if(USE_OPENMP)
  # Not REQUIRED: a missing OpenMP only downgrades the build.
  find_package(OpenMP)
  if(OpenMP_CXX_FOUND)
    message(STATUS "OpenMP ${OpenMP_CXX_VERSION} found — parallel builds enabled")
  else()
    message(WARNING "OpenMP not found — building single-threaded")
  endif()
endif()

add_library(compute src/compute.cpp)
target_include_directories(compute PUBLIC include)

# Test the option as well as the find result. OpenMP_CXX_FOUND may already be
# set by an enclosing project, and must not enable OpenMP when the user
# switched USE_OPENMP off.
if(USE_OPENMP AND OpenMP_CXX_FOUND)
  target_link_libraries(compute PUBLIC OpenMP::OpenMP_CXX)
  target_compile_definitions(compute PUBLIC USE_OPENMP=1)
else()
  target_compile_definitions(compute PUBLIC USE_OPENMP=0)
endif()

add_executable(app src/main.cpp)
target_link_libraries(app PRIVATE compute)
// src/compute.cpp — Conditional OpenMP in source
#include "compute.h"
#if USE_OPENMP
#include <omp.h>
#endif
// Returns the sum of the first `n` values in `data`.
// When the build sets USE_OPENMP to a non-zero value the loop runs as an
// OpenMP reduction; otherwise it is a plain serial loop.
double parallel_reduce(const double* data, int n) {
    double total = 0.0;
#if USE_OPENMP
    #pragma omp parallel for reduction(+:total)
#endif
    for (int idx = 0; idx < n; ++idx) {
        total += data[idx];
    }
    return total;
}
// Reports how many threads a parallel region may use: the OpenMP maximum when
// the build sets USE_OPENMP to a non-zero value, otherwise 1.
int get_thread_count() {
    int threads = 1;
#if USE_OPENMP
    threads = omp_get_max_threads();
#endif
    return threads;
}
Performance Tips
Correct CMake configuration directly impacts OpenMP performance. These patterns help avoid common performance pitfalls:
# Performance-optimized OpenMP build
find_package(OpenMP REQUIRED)

add_executable(benchmark src/benchmark.cpp)
target_link_libraries(benchmark PRIVATE OpenMP::OpenMP_CXX)

# Optimization level matters for OpenMP.
# Each entry is one quoted generator expression with ';'-separated flags. The
# Release and compiler conditions are combined with $<AND:...> so that no
# expression spans whitespace or line breaks, which would split it into
# separate arguments.
target_compile_options(benchmark PRIVATE
  "$<$<AND:$<CONFIG:Release>,$<CXX_COMPILER_ID:GNU>>:-O3;-march=native;-funroll-loops>"
  "$<$<AND:$<CONFIG:Release>,$<CXX_COMPILER_ID:Clang>>:-O3;-march=native>"
  "$<$<AND:$<CONFIG:Release>,$<CXX_COMPILER_ID:MSVC>>:/O2;/GL>"
)

# /GL only defers code generation to the linker; /LTCG is its link-time half.
target_link_options(benchmark PRIVATE
  "$<$<AND:$<CONFIG:Release>,$<CXX_COMPILER_ID:MSVC>>:/LTCG>"
)

# set_tests_properties() fails for a test that does not exist, so register
# perf_test here unless it has already been defined.
# enable_testing() must be called at the top level for CTest to run it.
if(NOT TEST perf_test)
  add_test(NAME perf_test COMMAND benchmark)
endif()

# Set default thread count for CTest
set_tests_properties(perf_test PROPERTIES
  ENVIRONMENT "OMP_NUM_THREADS=4;OMP_PROC_BIND=close;OMP_PLACES=cores"
)
Tip: set OMP_PROC_BIND=close and OMP_PLACES=cores in your test environment to pin threads to physical cores. Pinning reduces thread migration, NUMA-related memory traffic, and context-switch overhead, often improving performance by 20–40% on multi-socket systems.