Finding MPI
The Message Passing Interface (MPI) is the standard API for distributed-memory parallel programming. CMake's FindMPI module detects installed MPI implementations (OpenMPI, MPICH, Intel MPI, MS-MPI) and creates imported targets:
# CMakeLists.txt — Basic MPI integration
cmake_minimum_required(VERSION 3.20)
project(DistributedSolver LANGUAGES CXX)

# Require C++17 outright. Without CMAKE_CXX_STANDARD_REQUIRED, CMake may
# quietly fall back to an older standard when the compiler lacks C++17.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# FindMPI only queries MPI_Get_library_version (and therefore only fills
# MPI_CXX_LIBRARY_VERSION_STRING) when this is TRUE before find_package().
set(MPI_DETERMINE_LIBRARY_VERSION TRUE)
find_package(MPI REQUIRED COMPONENTS CXX)

add_executable(solver src/main.cpp src/distributed_solver.cpp)
# The imported target carries include paths, compile flags and libraries.
target_link_libraries(solver PRIVATE MPI::MPI_CXX)

# Print MPI implementation details
message(STATUS "MPI implementation: ${MPI_CXX_LIBRARY_VERSION_STRING}")
message(STATUS "MPI version: ${MPI_CXX_VERSION}")
message(STATUS "mpiexec: ${MPIEXEC_EXECUTABLE}")
message(STATUS "MPI num procs: ${MPIEXEC_MAX_NUMPROCS}")
# Install MPI on various platforms
# Run only the command for your platform and preferred MPI implementation.
# Ubuntu/Debian (OpenMPI)
sudo apt install libopenmpi-dev openmpi-bin
# Ubuntu/Debian (MPICH)
sudo apt install libmpich-dev mpich
# macOS (Homebrew) — OpenMPI
brew install open-mpi
# or, alternatively, MPICH
brew install mpich
# Windows (Microsoft MPI) — no package-manager command is given here
# Download from https://www.microsoft.com/en-us/download/details.aspx?id=105289
The MPI::MPI_CXX target encapsulates all include paths, compile flags, and link libraries needed. This is far superior to manually using the MPI_CXX_INCLUDE_DIRS and MPI_CXX_LIBRARIES variables, which don't propagate correctly through transitive dependencies.
C/C++/Fortran Components
MPI supports multiple language bindings. Request only the components your project needs:
# Multi-language MPI project
cmake_minimum_required(VERSION 3.20)
project(MultiLangMPI LANGUAGES C CXX Fortran)

# Request specific language components
find_package(MPI REQUIRED COMPONENTS C CXX Fortran)

# C library with MPI. PUBLIC so that consumers of mpi_io_c also get the
# MPI C usage requirements.
add_library(mpi_io_c src/mpi_io.c)
target_link_libraries(mpi_io_c PUBLIC MPI::MPI_C)

# C++ application
add_executable(cpp_solver src/solver.cpp)
target_link_libraries(cpp_solver PRIVATE MPI::MPI_CXX mpi_io_c)

# Fortran numerical kernel
add_library(fortran_kernel src/kernel.f90)
target_link_libraries(fortran_kernel PUBLIC MPI::MPI_Fortran)

# Combined application.
# cpp_solver is an EXECUTABLE target and cannot be linked into another
# target (CMake rejects that), so hybrid_app links the libraries directly.
add_executable(hybrid_app src/main.cpp)
target_link_libraries(hybrid_app PRIVATE
  mpi_io_c
  fortran_kernel
  MPI::MPI_CXX
)
// src/solver.cpp — Basic MPI C++ usage
//
// Sums the integers 1..1,000,000 across all MPI ranks: every rank sums its
// own contiguous slice and rank 0 prints the reduced total.
#include <mpi.h>
#include <iostream>
#include <vector>
#include <numeric>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Split the range into contiguous slices. A plain `total_n / size` would
    // drop the remainder whenever size does not divide total_n, so the first
    // `remainder` ranks take one extra element each.
    const int total_n = 1000000;
    const int base_n = total_n / size;
    const int remainder = total_n % size;
    const int local_n = base_n + (rank < remainder ? 1 : 0);
    // Elements owned by lower ranks: rank * base_n plus one per rank that
    // received an extra element.
    const int local_offset = rank * base_n + (rank < remainder ? rank : remainder);

    // Each process computes partial sum over its slice [offset+1, offset+local_n]
    std::vector<double> local_data(local_n);
    std::iota(local_data.begin(), local_data.end(), local_offset + 1.0);

    double local_sum = 0.0;
    for (double val : local_data) local_sum += val;

    // Reduce to rank 0
    double global_sum = 0.0;
    MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE,
               MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        std::cout << "Global sum: " << global_sum
                  << " (computed across " << size << " processes)\n";
    }

    MPI_Finalize();
    return 0;
}
Compiler Wrappers vs CMake Targets
MPI implementations provide compiler wrappers (mpicxx, mpicc, mpif90) that automatically add the necessary flags. However, using these as your CMAKE_CXX_COMPILER is discouraged — prefer imported targets:
# Anti-pattern — configuring with an MPI wrapper as the compiler:
#   cmake -B build -DCMAKE_CXX_COMPILER=mpicxx
#
# Preferred — configure with the regular compiler and let the
# MPI::MPI_CXX imported target supply every MPI-related flag:
#   cmake -B build -DCMAKE_CXX_COMPILER=g++
#
# For legacy setups that still care about the wrapper, report which one
# FindMPI located. The message is purely informational; linking should
# still go through MPI::MPI_CXX.
if(MPI_CXX_FOUND)
  message(STATUS "MPI C++ compiler: ${MPI_CXX_COMPILER}")
endif()
Setting CMAKE_CXX_COMPILER=mpicxx prevents CMake from properly detecting compiler features, breaks toolchain files, and causes issues with IDEs. Always use the base compiler and link against MPI::MPI_CXX instead.
Running MPI Tests with CTest
MPI tests require launching multiple processes via mpiexec. CMake provides variables for configuring this correctly with CTest:
# MPI test registration with CTest
#
# Every test is launched through mpiexec using the argument order that
# FindMPI documents:
#   mpiexec <numproc-flag> N <preflags> <executable> <postflags> <args>
# and every test carries the "mpi" label so `ctest -L mpi` selects all of them.

# Needed so FindMPI fills MPI_CXX_LIBRARY_VERSION_STRING, which is used
# below to decide whether the Open MPI-only --oversubscribe flag applies.
set(MPI_DETERMINE_LIBRARY_VERSION TRUE)
find_package(MPI REQUIRED COMPONENTS CXX)
include(CTest)

add_executable(mpi_test tests/test_communication.cpp)
target_link_libraries(mpi_test PRIVATE MPI::MPI_CXX)

# Basic MPI test (4 processes)
add_test(
  NAME mpi_comm_test_4
  COMMAND ${MPIEXEC_EXECUTABLE}
          ${MPIEXEC_NUMPROC_FLAG} 4
          ${MPIEXEC_PREFLAGS}
          $<TARGET_FILE:mpi_test>
          ${MPIEXEC_POSTFLAGS}
)
set_tests_properties(mpi_comm_test_4 PROPERTIES
  LABELS "mpi"
  PROCESSORS 4  # Tell CTest how many cores this needs
)

# Test with different process counts
foreach(NPROCS IN ITEMS 1 2 4 8)
  add_test(
    NAME mpi_scaling_${NPROCS}
    COMMAND ${MPIEXEC_EXECUTABLE}
            ${MPIEXEC_NUMPROC_FLAG} ${NPROCS}
            ${MPIEXEC_PREFLAGS}
            $<TARGET_FILE:mpi_test>
            ${MPIEXEC_POSTFLAGS}
            --gtest_filter=*Scaling*
  )
  set_tests_properties(mpi_scaling_${NPROCS} PROPERTIES
    LABELS "mpi;scaling"
    TIMEOUT 120
    PROCESSORS ${NPROCS}  # Tell CTest how many cores this needs
  )
endforeach()

# Oversubscription test (more procs than cores).
# --oversubscribe is an Open MPI launcher option, so it is only passed
# when the detected library identifies itself as Open MPI.
set(mpi_oversubscribe_flags "")
if("${MPI_CXX_LIBRARY_VERSION_STRING}" MATCHES "Open MPI")
  list(APPEND mpi_oversubscribe_flags "--oversubscribe")
endif()
add_test(
  NAME mpi_oversubscribe
  COMMAND ${MPIEXEC_EXECUTABLE}
          ${MPIEXEC_NUMPROC_FLAG} 16
          ${mpi_oversubscribe_flags}
          ${MPIEXEC_PREFLAGS}
          $<TARGET_FILE:mpi_test>
          ${MPIEXEC_POSTFLAGS}
)
set_tests_properties(mpi_oversubscribe PROPERTIES
  LABELS "mpi;oversubscribe"
)
# Run MPI tests from inside the build tree
cd build
# Everything labelled "mpi", one test at a time (MPI tests shouldn't run in parallel)
ctest --label-regex mpi -j 1
# A single test selected by name
ctest --tests-regex "mpi_scaling_4"
# Labelled tests again, with an extended timeout for large tests
ctest --label-regex mpi --timeout 300
OpenMPI vs MPICH Differences
The two major open-source MPI implementations have different behaviors that affect CMake configuration:
# Detect MPI implementation for implementation-specific flags
#
# Outputs:
#   MPI_IMPL             - "openmpi", "mpich", "intelmpi" or "unknown"
#   MPI_TEST_EXTRA_FLAGS - list of extra launcher flags (possibly empty)

# FindMPI only fills MPI_CXX_LIBRARY_VERSION_STRING when this is TRUE
# before find_package(); without it none of the branches below can match.
set(MPI_DETERMINE_LIBRARY_VERSION TRUE)
find_package(MPI REQUIRED COMPONENTS CXX)

# Check which implementation we found. The variable is quoted so an empty
# version string is compared as text and never re-dereferenced.
if("${MPI_CXX_LIBRARY_VERSION_STRING}" MATCHES "Open MPI")
  set(MPI_IMPL "openmpi")
  message(STATUS "Detected OpenMPI")
elseif("${MPI_CXX_LIBRARY_VERSION_STRING}" MATCHES "MPICH")
  set(MPI_IMPL "mpich")
  message(STATUS "Detected MPICH")
elseif("${MPI_CXX_LIBRARY_VERSION_STRING}" MATCHES "Intel")
  set(MPI_IMPL "intelmpi")
  message(STATUS "Detected Intel MPI")
else()
  # Always define MPI_IMPL so later comparisons never see an unset or
  # stale value.
  set(MPI_IMPL "unknown")
  message(STATUS "MPI implementation not recognized; no implementation-specific flags added")
endif()

# Implementation-specific test flags. Start from an empty list for every
# implementation so nothing leaks in from an enclosing scope.
set(MPI_TEST_EXTRA_FLAGS "")
if("${MPI_IMPL}" STREQUAL "openmpi")
  # OpenMPI requires --oversubscribe for more procs than cores
  list(APPEND MPI_TEST_EXTRA_FLAGS "--oversubscribe")
  # OpenMPI uses --mca for runtime parameters
  list(APPEND MPI_TEST_EXTRA_FLAGS "--mca" "btl" "self,tcp")
endif()
# MPICH allows oversubscription by default, so it needs no extra flags.

add_test(
  NAME distributed_test
  COMMAND ${MPIEXEC_EXECUTABLE}
          ${MPIEXEC_NUMPROC_FLAG} 4
          ${MPI_TEST_EXTRA_FLAGS}
          ${MPIEXEC_PREFLAGS}
          $<TARGET_FILE:my_mpi_app>
          ${MPIEXEC_POSTFLAGS}
)
Hybrid MPI+OpenMP
Many HPC applications combine MPI (inter-node) with OpenMP (intra-node) for optimal performance. CMake handles both simultaneously:
# Hybrid MPI + OpenMP build
cmake_minimum_required(VERSION 3.20)
project(HybridSolver LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
# Fail at configure time instead of silently decaying to an older standard.
set(CMAKE_CXX_STANDARD_REQUIRED ON)

find_package(MPI REQUIRED COMPONENTS CXX)
find_package(OpenMP REQUIRED)

# add_test() only generates tests once testing is enabled; include(CTest)
# does that and also defines the BUILD_TESTING option.
include(CTest)

add_executable(hybrid_solver
  src/main.cpp
  src/domain_decomposition.cpp
  src/local_solver.cpp
)

# Link both MPI and OpenMP
target_link_libraries(hybrid_solver PRIVATE
  MPI::MPI_CXX
  OpenMP::OpenMP_CXX
)

if(BUILD_TESTING)
  # Hybrid test: 2 MPI processes × 4 OpenMP threads.
  # MPIEXEC_PREFLAGS/POSTFLAGS are forwarded so launcher options can be
  # supplied at configure time without editing this file.
  # NOTE(review): some launchers bind each rank to a single core by
  # default, which would keep all 4 threads on one core — confirm the
  # binding policy of your launcher and pass flags via MPIEXEC_PREFLAGS.
  add_test(
    NAME hybrid_2x4
    COMMAND ${MPIEXEC_EXECUTABLE}
            ${MPIEXEC_NUMPROC_FLAG} 2
            ${MPIEXEC_PREFLAGS}
            $<TARGET_FILE:hybrid_solver>
            ${MPIEXEC_POSTFLAGS}
  )
  set_tests_properties(hybrid_2x4 PROPERTIES
    ENVIRONMENT "OMP_NUM_THREADS=4;OMP_PROC_BIND=close"
    PROCESSORS 8  # 2 procs × 4 threads
    LABELS "hybrid"
  )
endif()
// src/main.cpp — Hybrid MPI+OpenMP pattern
//
// Each MPI rank fills a local array, sums the squares of its elements with
// an OpenMP reduction, and the per-rank sums are combined on every rank
// with MPI_Allreduce. Rank 0 prints the result.
#include <mpi.h>
#include <omp.h>
#include <iostream>
#include <vector>

int main(int argc, char** argv) {
    // Ask for FUNNELED threading and abort when the library grants less.
    int granted_level = 0;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &granted_level);
    if (granted_level < MPI_THREAD_FUNNELED) {
        std::cerr << "MPI does not support required threading level\n";
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    int my_rank = 0;
    int world_size = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    std::cout << "Rank " << my_rank << "/" << world_size
              << " using " << omp_get_max_threads() << " threads\n";

    // Local domain: one million cells, all holding (rank + 1).
    const int cell_count = 1000000;
    std::vector<double> cells(cell_count, my_rank + 1.0);

    // Thread-parallel sum of squares over the local domain.
    double rank_total = 0.0;
    #pragma omp parallel for reduction(+:rank_total)
    for (int idx = 0; idx < cell_count; ++idx) {
        const double value = cells[idx];
        rank_total += value * value;
    }

    // Combine the per-rank totals; every rank receives the result.
    double combined_total = 0.0;
    MPI_Allreduce(&rank_total, &combined_total, 1, MPI_DOUBLE,
                  MPI_SUM, MPI_COMM_WORLD);

    if (my_rank == 0) {
        std::cout << "Global result: " << combined_total << "\n";
    }

    MPI_Finalize();
    return 0;
}
Custom MPI Launchers
Some environments require custom launchers instead of the standard mpiexec. CMake supports overriding the launch command:
# Custom MPI launcher (e.g., srun on SLURM clusters)
#
# MPI_LAUNCHER is a string cache variable, not an option(): option() is
# meant for ON/OFF values only and would present the launcher as a
# checkbox in cmake-gui/ccmake. Leave it empty to use the mpiexec that
# FindMPI located.
set(MPI_LAUNCHER "" CACHE STRING
    "Custom MPI launcher command (e.g. srun, aprun); empty uses mpiexec")

if(NOT "${MPI_LAUNCHER}" STREQUAL "")
  set(MPI_RUN_COMMAND ${MPI_LAUNCHER})
  # The custom launcher is assumed to take its process count via -n.
  set(MPI_NPROC_FLAG "-n")
else()
  set(MPI_RUN_COMMAND ${MPIEXEC_EXECUTABLE})
  set(MPI_NPROC_FLAG ${MPIEXEC_NUMPROC_FLAG})
endif()

# Register test with custom launcher
add_test(
  NAME distributed_solve
  COMMAND ${MPI_RUN_COMMAND}
          ${MPI_NPROC_FLAG} 4
          $<TARGET_FILE:solver>
)

# Usage:
#   cmake -B build -DMPI_LAUNCHER=srun   # SLURM
#   cmake -B build -DMPI_LAUNCHER=aprun  # Cray
#   cmake -B build                       # Default mpiexec
HPC Cluster Integration
Deploying CMake projects to HPC clusters requires handling module systems, cross-compilation toolchains, and batch schedulers:
# CMake toolchain file for HPC cluster (toolchain-hpc.cmake)
# Usage: cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-hpc.cmake

# NOTE(review): setting CMAKE_SYSTEM_NAME by hand makes CMake treat the
# build as cross-compiling (CMAKE_CROSSCOMPILING is TRUE) — confirm that
# is intended when login and compute nodes share OS and architecture.
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR x86_64)

# Compiler loaded via module system
# module load gcc/12.3.0 openmpi/4.1.5
set(CMAKE_C_COMPILER gcc)
set(CMAKE_CXX_COMPILER g++)
set(CMAKE_Fortran_COMPILER gfortran)

# MPI hints (module sets these in environment).
# Only cache the hint when the variable is actually set, so an empty path
# is not stored on a first configure run outside the module environment.
if(DEFINED ENV{MPI_HOME})
  set(MPI_HOME "$ENV{MPI_HOME}" CACHE PATH "MPI installation prefix")
endif()

# Optimization for cluster hardware.
# The *_INIT variable is the hook intended for toolchain files: CMake
# combines it with the compiler's own Release defaults, so the default
# optimization level and -DNDEBUG are kept. Overwriting
# CMAKE_CXX_FLAGS_RELEASE in the cache would discard them.
set(CMAKE_CXX_FLAGS_RELEASE_INIT "-march=znver3 -mtune=znver3")

# Install prefix on shared filesystem
set(CMAKE_INSTALL_PREFIX "/shared/software/myapp/1.0" CACHE PATH "")
#!/bin/bash
# SLURM job script for running CTest on HPC cluster
#
# Configures and builds the project, then runs the tests labelled "mpi".
#SBATCH --job-name=cmake-tests
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=32
#SBATCH --time=01:00:00
#SBATCH --partition=compute

# Stop at the first failing step so tests never run against a failed or
# stale build. This must stay below the #SBATCH lines, which are only
# honoured before the first executable command.
set -e

# Load required modules
module load cmake/3.28 gcc/12.3 openmpi/4.1.5

# Build
cmake -B build -S . \
    -DCMAKE_BUILD_TYPE=Release \
    -DMPI_LAUNCHER=srun

# Build parallelism follows the --ntasks-per-node request above; 32 is the
# fallback when the scheduler does not export the variable.
cmake --build build --parallel "${SLURM_NTASKS_PER_NODE:-32}"

# Run tests with SLURM's srun
cd build
ctest --output-on-failure \
    --timeout 600 \
    -L "mpi" \
    --parallel 1 # MPI tests manage their own parallelism
Toolchain files keep your CMakeLists.txt portable while allowing each cluster to have its own configuration. Store toolchain files in a cmake/toolchains/ directory.
Avoid -march=native on login nodes if your tests run on compute nodes — use explicit architecture flags matching the compute hardware (e.g., -march=znver3 for AMD EPYC).