Table of Contents

  1. Finding MPI
  2. C/C++/Fortran Components
  3. Compiler Wrappers vs CMake Targets
  4. Running MPI Tests with CTest
  5. OpenMPI vs MPICH Differences
  6. Hybrid MPI+OpenMP
  7. Custom MPI Launchers
  8. HPC Cluster Integration
Back to CMake Mastery Series

MPI

June 4, 2026 Wasil Zafar 10 min read

The definitive guide to integrating MPI with CMake — finding implementations, imported targets, CTest with mpiexec, hybrid MPI+OpenMP builds, and deploying to HPC clusters.

Parallel

Finding MPI

The Message Passing Interface (MPI) is the standard API for distributed-memory parallel programming. CMake's FindMPI module detects installed MPI implementations (OpenMPI, MPICH, Intel MPI, MS-MPI) and creates imported targets:

# CMakeLists.txt — Basic MPI integration
#
# Builds the `solver` executable against the C++ MPI imported target and
# prints what FindMPI discovered, as a configure-time sanity check.
cmake_minimum_required(VERSION 3.20)
project(DistributedSolver LANGUAGES CXX)

# Fail at configure time if the compiler cannot do C++17, instead of
# silently falling back to an older standard.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# FindMPI fills MPI_<lang>_LIBRARY_VERSION_STRING only when this switch is
# enabled; without it the "MPI implementation" line below prints an empty
# value. Enabling it makes FindMPI build and run a tiny MPI program at
# configure time.
set(MPI_DETERMINE_LIBRARY_VERSION TRUE)
find_package(MPI REQUIRED COMPONENTS CXX)

add_executable(solver src/main.cpp src/distributed_solver.cpp)
target_link_libraries(solver PRIVATE MPI::MPI_CXX)

# Print MPI implementation details
message(STATUS "MPI implementation: ${MPI_CXX_LIBRARY_VERSION_STRING}")
message(STATUS "MPI version: ${MPI_CXX_VERSION}")
message(STATUS "mpiexec: ${MPIEXEC_EXECUTABLE}")
message(STATUS "MPI num procs: ${MPIEXEC_MAX_NUMPROCS}")
# Install MPI on various platforms
# NOTE: the entries below are alternatives — choose the one matching your
# platform and preferred implementation rather than running them all.

# Ubuntu/Debian (OpenMPI)
sudo apt install libopenmpi-dev openmpi-bin

# Ubuntu/Debian (MPICH)
sudo apt install libmpich-dev mpich

# macOS (Homebrew) — OpenMPI
brew install open-mpi
# or MPICH
brew install mpich

# Windows (Microsoft MPI)
# Download from https://www.microsoft.com/en-us/download/details.aspx?id=105289
Key Insight: CMake's MPI::MPI_CXX target encapsulates all include paths, compile flags, and link libraries needed. This is far superior to manually using MPI_CXX_INCLUDE_DIRS and MPI_CXX_LIBRARIES variables, which don't propagate correctly through transitive dependencies.

C/C++/Fortran Components

MPI supports multiple language bindings. Request only the components your project needs:

# Multi-language MPI project
#
# Demonstrates one MPI imported target per language: a C library, a C++
# executable, a Fortran library, and an application combining them.
cmake_minimum_required(VERSION 3.20)
project(MultiLangMPI LANGUAGES C CXX Fortran)

# Request specific language components
find_package(MPI REQUIRED COMPONENTS C CXX Fortran)

# C library with MPI (PUBLIC: consumers of mpi_io_c also get MPI::MPI_C)
add_library(mpi_io_c src/mpi_io.c)
target_link_libraries(mpi_io_c PUBLIC MPI::MPI_C)

# C++ application
add_executable(cpp_solver src/solver.cpp)
target_link_libraries(cpp_solver PRIVATE MPI::MPI_CXX mpi_io_c)

# Fortran numerical kernel
add_library(fortran_kernel src/kernel.f90)
target_link_libraries(fortran_kernel PUBLIC MPI::MPI_Fortran)

# Combined application
# cpp_solver is an executable, and CMake refuses to link an executable
# target (without ENABLE_EXPORTS) into another target, so the original
# `cpp_solver` entry here was a generate-time error. Link the libraries
# directly instead.
# NOTE(review): if hybrid_app needs code from src/solver.cpp, move that code
# into a library target and link it from both executables.
add_executable(hybrid_app src/main.cpp)
target_link_libraries(hybrid_app PRIVATE
    mpi_io_c
    fortran_kernel
    MPI::MPI_CXX
)
// src/solver.cpp — Basic MPI C++ usage
//
// Sums the integers 1..1,000,000 in parallel: every rank sums its own
// contiguous slice, then the partial sums are reduced onto rank 0.
#include <mpi.h>
#include <algorithm>
#include <iostream>
#include <vector>
#include <numeric>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Split the range across ranks. A plain `total_n / size` silently drops
    // the remainder when the rank count does not divide total_n (e.g. 3
    // ranks), so the first `extra` ranks take one additional element each.
    const int total_n = 1000000;
    const int base = total_n / size;
    const int extra = total_n % size;
    const int local_n = base + (rank < extra ? 1 : 0);
    // Zero-based index of this rank's first element in the global range.
    const int first = rank * base + std::min(rank, extra);

    // Each process computes partial sum
    std::vector<double> local_data(local_n);
    std::iota(local_data.begin(), local_data.end(), first + 1.0);

    double local_sum = 0.0;
    for (double val : local_data) local_sum += val;

    // Reduce to rank 0
    double global_sum = 0.0;
    MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE,
               MPI_SUM, 0, MPI_COMM_WORLD);

    // Only rank 0 holds the reduced value, so only rank 0 reports it.
    if (rank == 0) {
        std::cout << "Global sum: " << global_sum
                  << " (computed across " << size << " processes)\n";
    }

    MPI_Finalize();
    return 0;
}

Compiler Wrappers vs CMake Targets

MPI implementations provide compiler wrappers (mpicxx, mpicc, mpif90) that automatically add the necessary flags. However, using these as your CMAKE_CXX_COMPILER is discouraged — prefer imported targets:

# Anti-pattern — pointing CMake at an MPI wrapper compiler:
#   cmake -B build -DCMAKE_CXX_COMPILER=mpicxx
#
# Preferred — configure with the ordinary compiler and let the
# MPI::MPI_CXX imported target contribute the MPI flags:
#   cmake -B build -DCMAKE_CXX_COMPILER=g++

# Legacy projects that still need to know which wrapper exists can report it.
# The message is informational only; linking still goes through MPI::MPI_CXX.
if(MPI_CXX_FOUND)
    message(STATUS "MPI C++ compiler: ${MPI_CXX_COMPILER}")
endif()
Pitfall: Setting CMAKE_CXX_COMPILER=mpicxx prevents CMake from properly detecting compiler features, breaks toolchain files, and causes issues with IDEs. Always use the base compiler and link against MPI::MPI_CXX instead.

Running MPI Tests with CTest

MPI tests require launching multiple processes via mpiexec. CMake provides variables for configuring this correctly with CTest:

# MPI test registration with CTest
#
# Every test launches the same `mpi_test` binary through mpiexec, using the
# launcher variables FindMPI provides, and carries the "mpi" label so that
# `ctest -L mpi` selects all of them.

# Needed so MPI_CXX_LIBRARY_VERSION_STRING is populated for the
# implementation check further down (FindMPI leaves it empty otherwise).
set(MPI_DETERMINE_LIBRARY_VERSION TRUE)
find_package(MPI REQUIRED COMPONENTS CXX)
include(CTest)

add_executable(mpi_test tests/test_communication.cpp)
target_link_libraries(mpi_test PRIVATE MPI::MPI_CXX)

# Basic MPI test (4 processes)
add_test(
    NAME mpi_comm_test_4
    COMMAND ${MPIEXEC_EXECUTABLE}
            ${MPIEXEC_NUMPROC_FLAG} 4
            ${MPIEXEC_PREFLAGS}
            $<TARGET_FILE:mpi_test>
            ${MPIEXEC_POSTFLAGS}
)
set_tests_properties(mpi_comm_test_4 PROPERTIES
    LABELS "mpi"
    PROCESSORS 4  # Tell CTest how many cores this needs
)

# Test with different process counts
foreach(NPROCS 1 2 4 8)
    add_test(
        NAME mpi_scaling_${NPROCS}
        COMMAND ${MPIEXEC_EXECUTABLE}
                ${MPIEXEC_NUMPROC_FLAG} ${NPROCS}
                ${MPIEXEC_PREFLAGS}
                $<TARGET_FILE:mpi_test>
                ${MPIEXEC_POSTFLAGS}
                --gtest_filter=*Scaling*
    )
    set_tests_properties(mpi_scaling_${NPROCS} PROPERTIES
        LABELS "mpi;scaling"
        TIMEOUT 120
        PROCESSORS ${NPROCS}  # Tell CTest how many cores this needs
    )
endforeach()

# Oversubscription test (more procs than cores)
# `--oversubscribe` is an Open MPI launcher option; pass it only when Open MPI
# was detected so the test is not handed an unknown flag on other launchers.
# NOTE(review): if the library version cannot be determined (e.g. when
# cross-compiling), the flag is omitted — confirm that suits your cluster.
set(mpi_oversubscribe_flags "")
if("${MPI_CXX_LIBRARY_VERSION_STRING}" MATCHES "Open MPI")
    set(mpi_oversubscribe_flags "--oversubscribe")
endif()

add_test(
    NAME mpi_oversubscribe
    COMMAND ${MPIEXEC_EXECUTABLE}
            ${MPIEXEC_NUMPROC_FLAG} 16
            ${mpi_oversubscribe_flags}
            ${MPIEXEC_PREFLAGS}
            $<TARGET_FILE:mpi_test>
            ${MPIEXEC_POSTFLAGS}
)
set_tests_properties(mpi_oversubscribe PROPERTIES
    LABELS "mpi"
)
# Run MPI tests
# Each ctest line below is an independent example, run from the build tree.
cd build
ctest -L mpi --parallel 1   # Only tests labelled "mpi", one test at a time
ctest -R "mpi_scaling_4"    # Run specific test (-R is a regex on test names)
# Extended timeout for large tests.
# NOTE(review): --timeout sets the default; tests with their own TIMEOUT
# property presumably keep that value — confirm against the CTest docs.
ctest --timeout 300 -L mpi

OpenMPI vs MPICH Differences

The two major open-source MPI implementations have different behaviors that affect CMake configuration:

# Detect MPI implementation for implementation-specific flags
#
# Sets MPI_IMPL to one of "openmpi", "mpich", "intelmpi" or "unknown", and
# MPI_TEST_EXTRA_FLAGS to the launcher flags the tests should receive.

# FindMPI populates MPI_<lang>_LIBRARY_VERSION_STRING only when this is
# enabled; without it every branch below falls through and MPI_IMPL is never
# set. Enabling it makes FindMPI build and run a small MPI program at
# configure time.
set(MPI_DETERMINE_LIBRARY_VERSION TRUE)
find_package(MPI REQUIRED COMPONENTS CXX)

# Check which implementation we found
if("${MPI_CXX_LIBRARY_VERSION_STRING}" MATCHES "Open MPI")
    set(MPI_IMPL "openmpi")
    message(STATUS "Detected OpenMPI")
elseif("${MPI_CXX_LIBRARY_VERSION_STRING}" MATCHES "MPICH")
    set(MPI_IMPL "mpich")
    message(STATUS "Detected MPICH")
elseif("${MPI_CXX_LIBRARY_VERSION_STRING}" MATCHES "Intel")
    set(MPI_IMPL "intelmpi")
    message(STATUS "Detected Intel MPI")
else()
    # Keep MPI_IMPL defined so later comparisons behave predictably.
    set(MPI_IMPL "unknown")
    message(STATUS
        "MPI implementation not recognised: '${MPI_CXX_LIBRARY_VERSION_STRING}'")
endif()

# Implementation-specific test flags
# Start from an empty list so implementations without special handling
# (MPICH, which allows oversubscription by default, Intel MPI, unknown)
# pass no extra flags.
set(MPI_TEST_EXTRA_FLAGS "")
if("${MPI_IMPL}" STREQUAL "openmpi")
    # OpenMPI requires --oversubscribe for more procs than cores
    list(APPEND MPI_TEST_EXTRA_FLAGS "--oversubscribe")
    # OpenMPI uses --mca for runtime parameters
    list(APPEND MPI_TEST_EXTRA_FLAGS "--mca" "btl" "self,tcp")
endif()

# NOTE(review): my_mpi_app is not defined in this snippet; it is assumed to be
# an executable target created elsewhere in the project.
add_test(
    NAME distributed_test
    COMMAND ${MPIEXEC_EXECUTABLE}
            ${MPIEXEC_NUMPROC_FLAG} 4
            ${MPI_TEST_EXTRA_FLAGS}
            $<TARGET_FILE:my_mpi_app>
)

Hybrid MPI+OpenMP

Many HPC applications combine MPI (inter-node) with OpenMP (intra-node) for optimal performance. CMake handles both simultaneously:

# Hybrid MPI + OpenMP build
#
# Builds `hybrid_solver` against both the MPI and OpenMP imported targets and
# registers one CTest case running 2 MPI ranks with 4 OpenMP threads each.
cmake_minimum_required(VERSION 3.20)
project(HybridSolver LANGUAGES CXX)

# Fail at configure time if C++17 is unavailable instead of silently
# falling back to an older standard.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# add_test() is ignored unless testing is enabled for the project;
# include(CTest) calls enable_testing() and defines BUILD_TESTING.
include(CTest)

find_package(MPI REQUIRED COMPONENTS CXX)
# Only the C++ OpenMP target is used, so only require that component.
find_package(OpenMP REQUIRED COMPONENTS CXX)

add_executable(hybrid_solver
    src/main.cpp
    src/domain_decomposition.cpp
    src/local_solver.cpp
)

# Link both MPI and OpenMP
target_link_libraries(hybrid_solver PRIVATE
    MPI::MPI_CXX
    OpenMP::OpenMP_CXX
)

# Hybrid test: 2 MPI processes × 4 OpenMP threads
add_test(
    NAME hybrid_2x4
    COMMAND ${MPIEXEC_EXECUTABLE}
            ${MPIEXEC_NUMPROC_FLAG} 2
            ${MPIEXEC_PREFLAGS}
            $<TARGET_FILE:hybrid_solver>
            ${MPIEXEC_POSTFLAGS}
)
# NOTE(review): some launchers pin each rank to a single core by default, which
# would confine all 4 threads to that core — confirm the launcher's binding
# policy (or pass a binding option via MPIEXEC_PREFLAGS) on the target system.
set_tests_properties(hybrid_2x4 PROPERTIES
    ENVIRONMENT "OMP_NUM_THREADS=4;OMP_PROC_BIND=close"
    PROCESSORS 8  # 2 procs × 4 threads
    LABELS "mpi;hybrid"  # "mpi" so `ctest -L mpi` also selects this test
)
// src/main.cpp — Hybrid MPI+OpenMP pattern
//
// Each MPI rank sums the squares of its local array using an OpenMP parallel
// loop; the per-rank sums are then combined with MPI_Allreduce and rank 0
// prints the total.
#include <mpi.h>
#include <omp.h>
#include <iostream>
#include <vector>

int main(int argc, char** argv) {
    // Ask for FUNNELED support: only the thread that initialised MPI will
    // make MPI calls (all MPI calls below are outside the parallel region).
    int thread_level;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &thread_level);

    const bool funneled_available = thread_level >= MPI_THREAD_FUNNELED;
    if (!funneled_available) {
        std::cerr << "MPI does not support required threading level\n";
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    int my_rank;
    int world_size;
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    std::cout << "Rank " << my_rank << "/" << world_size
              << " using " << omp_get_max_threads() << " threads\n";

    // Local domain: one million entries, all equal to (rank + 1).
    const int n_local = 1000000;
    std::vector<double> values(n_local, my_rank + 1.0);

    // Thread-parallel sum of squares over the local domain.
    double partial = 0.0;
    #pragma omp parallel for reduction(+:partial)
    for (int idx = 0; idx < n_local; ++idx) {
        const double v = values[idx];
        partial += v * v;
    }

    // Combine the per-rank partial sums; every rank receives the total.
    double total = 0.0;
    MPI_Allreduce(&partial, &total, 1, MPI_DOUBLE,
                  MPI_SUM, MPI_COMM_WORLD);

    if (my_rank == 0) {
        std::cout << "Global result: " << total << "\n";
    }

    MPI_Finalize();
    return 0;
}

Custom MPI Launchers

Some environments require custom launchers instead of the standard mpiexec. You can point FindMPI at a different launcher by setting the MPIEXEC_EXECUTABLE cache variable, or add a project-level override as shown below:

# Custom MPI launcher (e.g., srun on SLURM clusters)
#
# MPI_LAUNCHER            Launcher command used instead of mpiexec. Empty (the
#                         default) means "use the mpiexec found by FindMPI".
#                         A ;-separated list may carry launcher arguments,
#                         e.g. -DMPI_LAUNCHER="srun;--mpi=pmix".
# MPI_LAUNCHER_NPROC_FLAG Flag the custom launcher uses for the process count.
#
# option() is boolean-only, so a command name belongs in a STRING cache entry.
set(MPI_LAUNCHER "" CACHE STRING
    "Custom MPI launcher command (empty = mpiexec found by FindMPI)")
set(MPI_LAUNCHER_NPROC_FLAG "-n" CACHE STRING
    "Process-count flag understood by MPI_LAUNCHER")

if(MPI_LAUNCHER)
    set(MPI_RUN_COMMAND ${MPI_LAUNCHER})
    set(MPI_NPROC_FLAG ${MPI_LAUNCHER_NPROC_FLAG})
else()
    # MPIEXEC_* variables come from find_package(MPI), which is assumed to
    # have been called earlier in the project.
    set(MPI_RUN_COMMAND ${MPIEXEC_EXECUTABLE})
    set(MPI_NPROC_FLAG ${MPIEXEC_NUMPROC_FLAG})
endif()

# Register test with custom launcher
add_test(
    NAME distributed_solve
    COMMAND ${MPI_RUN_COMMAND}
            ${MPI_NPROC_FLAG} 4
            $<TARGET_FILE:solver>
)

# Usage:
# cmake -B build -DMPI_LAUNCHER=srun     # SLURM
# cmake -B build -DMPI_LAUNCHER=aprun    # Cray
# cmake -B build                          # Default mpiexec

HPC Cluster Integration

Deploying CMake projects to HPC clusters requires handling module systems, cross-compilation toolchains, and batch schedulers:

# CMake toolchain file for HPC cluster (toolchain-hpc.cmake)
# Usage: cmake -B build -DCMAKE_TOOLCHAIN_FILE=toolchain-hpc.cmake
#
# Collects the cluster-specific choices (compilers, MPI location, CPU tuning,
# install location) so the project's CMakeLists.txt stays portable.

# NOTE(review): setting CMAKE_SYSTEM_NAME by hand puts CMake into
# cross-compiling mode, which prevents configure-time checks from running
# test programs. If you configure on the same OS/architecture the binaries
# run on, consider dropping these two lines.
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR x86_64)

# Compiler loaded via module system
# module load gcc/12.3.0 openmpi/4.1.5
set(CMAKE_C_COMPILER gcc)
set(CMAKE_CXX_COMPILER g++)
set(CMAKE_Fortran_COMPILER gfortran)

# MPI hints (module sets these in environment)
# $ENV{...} is read at configure time only; the value is then kept in the cache.
set(MPI_HOME "$ENV{MPI_HOME}" CACHE PATH "MPI installation prefix")

# Optimization for cluster hardware
# Use the *_INIT variables, which are the toolchain-file hook: CMake appends
# its own Release defaults to them. Overwriting CMAKE_CXX_FLAGS_RELEASE
# directly would drop the default -DNDEBUG and leave assertions enabled in
# Release builds. The flags are applied to all three configured languages so
# C, C++ and Fortran objects target the same CPU.
set(hpc_arch_flags "-march=znver3 -mtune=znver3")
foreach(hpc_lang C CXX Fortran)
    set(CMAKE_${hpc_lang}_FLAGS_RELEASE_INIT "${hpc_arch_flags}")
endforeach()
unset(hpc_lang)
unset(hpc_arch_flags)

# Install prefix on shared filesystem
# A cache default only: -DCMAKE_INSTALL_PREFIX=... on the command line wins.
set(CMAKE_INSTALL_PREFIX "/shared/software/myapp/1.0" CACHE PATH "")
#!/bin/bash
# SLURM job script for running CTest on HPC cluster
#
# Configures, builds and tests the project inside one batch allocation.
# The #SBATCH directives must stay ahead of the first executable command.
#SBATCH --job-name=cmake-tests
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=32
#SBATCH --time=01:00:00
#SBATCH --partition=compute

# Load required modules
module load cmake/3.28 gcc/12.3 openmpi/4.1.5

# Stop at the first failing step so a broken configure or build is never
# followed by a test run against stale (or missing) binaries.
set -e

# Build
cmake -B build -S . \
    -DCMAKE_BUILD_TYPE=Release \
    -DMPI_LAUNCHER=srun

# Use the CPU count SLURM reports for this node; fall back to 32 when the
# variable is not set (e.g. when the script is run outside a job).
cmake --build build --parallel "${SLURM_CPUS_ON_NODE:-32}"

# Run tests with SLURM's srun
# --parallel 1: MPI tests manage their own parallelism, so run one at a time.
cd build
ctest --output-on-failure \
      --timeout 600 \
      -L "mpi" \
      --parallel 1
HPC Best Practice: Use CMake toolchain files to encapsulate cluster-specific settings (compilers, optimization flags, MPI paths). This keeps your CMakeLists.txt portable while allowing each cluster to have its own configuration. Store toolchain files in a cmake/toolchains/ directory.
Pitfall: On HPC clusters, the login node often has different hardware than the compute nodes. Never compile with -march=native on a login node if the binaries will run on compute nodes — use explicit architecture flags matching the compute hardware (e.g., -march=znver3 for Zen 3-based AMD EPYC).