From 3872fc13f45ea55ca7e27f5d63bccf88bfec7f8d Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <weiqunzhang@lbl.gov>
Date: Fri, 8 Nov 2024 15:30:29 -0800
Subject: [PATCH] Use AMReX FFT for IGF Solver

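This replaces the previous IGF solver implementations (distributed FFT via
heFFTe, or a single-box FFT via AnyFFT) with AMReX FFT. A new runtime
parameter, ablastr.do_serial_fft, is added. It defaults to false, in which
case the FFT uses all MPI processes; if true, only one process performs the
FFT.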
---
 .azure-pipelines.yml                          |  12 -
 .github/workflows/cuda.yml                    |  13 +-
 .github/workflows/dependencies/hip.sh         |  13 -
 .github/workflows/hip.yml                     |   6 +-
 CMakeLists.txt                                |  37 ---
 Docs/source/install/cmake.rst                 |   2 -
 Docs/source/install/dependencies.rst          |   3 +-
 .../open_bc_poisson_solver/CMakeLists.txt     |  12 -
 ...puts_test_3d_open_bc_poisson_solver_heffte |   1 -
 GNUmakefile                                   |   1 -
 .../fields/IntegratedGreenFunctionSolver.cpp  | 275 +++---------------
 .../machines/desktop/spack-macos-openmp.yaml  |   1 -
 Tools/machines/desktop/spack-ubuntu-cuda.yaml |   1 -
 .../machines/desktop/spack-ubuntu-openmp.yaml |   1 -
 Tools/machines/desktop/spack-ubuntu-rocm.yaml |   1 -
 .../install_a100_dependencies.sh              |  39 ---
 .../lonestar6_warpx_a100.profile.example      |   2 -
 .../install_cpu_dependencies.sh               |  39 ---
 .../install_gpu_dependencies.sh               |  43 ---
 .../perlmutter_cpu_warpx.profile.example      |   2 -
 .../perlmutter_gpu_warpx.profile.example      |   2 -
 .../tioga-llnl/install_mi300a_dependencies.sh |  42 ---
 .../tioga_mi300a_warpx.profile.example        |   2 -
 cmake/WarpXFunctions.cmake                    |   5 -
 cmake/dependencies/AMReX.cmake                |  15 +-
 setup.py                                      |   2 -
 26 files changed, 54 insertions(+), 518 deletions(-)
 delete mode 100644 Examples/Tests/open_bc_poisson_solver/inputs_test_3d_open_bc_poisson_solver_heffte

diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml
index 62d8a0a424d..d22097a208f 100644
--- a/.azure-pipelines.yml
+++ b/.azure-pipelines.yml
@@ -38,7 +38,6 @@ jobs:
       # Cartesian 3D
       cartesian_3d:
         WARPX_CMAKE_FLAGS: -DWarpX_DIMS=3 -DWarpX_FFT=ON -DWarpX_PYTHON=ON
-        WARPX_HEFFTE: 'TRUE'
       # Cylindrical RZ
       cylindrical_rz:
         WARPX_CMAKE_FLAGS: -DWarpX_DIMS=RZ -DWarpX_FFT=ON -DWarpX_PYTHON=ON
@@ -121,17 +120,6 @@ jobs:
           -DCMAKE_CXX_STANDARD=17                         \
           -Duse_cmake_find_lapack=ON -Dbuild_tests=OFF -DCMAKE_VERBOSE_MAKEFILE=ON
       fi
-      if [ "${WARPX_HEFFTE:-FALSE}" == "TRUE" ]; then
-        cmake-easyinstall --prefix=/usr/local git+https://github.com/icl-utk-edu/heffte.git@v2.4.0 \
-          -DCMAKE_CXX_COMPILER_LAUNCHER=$(which ccache)          \
-          -DCMAKE_CXX_STANDARD=17 -DHeffte_ENABLE_DOXYGEN=OFF    \
-          -DHeffte_ENABLE_FFTW=ON -DHeffte_ENABLE_TESTING=OFF    \
-          -DHeffte_ENABLE_CUDA=OFF -DHeffte_ENABLE_ROCM=OFF      \
-          -DHeffte_ENABLE_ONEAPI=OFF -DHeffte_ENABLE_MKL=OFF     \
-          -DHeffte_ENABLE_PYTHON=OFF -DHeffte_ENABLE_FORTRAN=OFF \
-          -DHeffte_ENABLE_MAGMA=OFF                              \
-          -DCMAKE_VERBOSE_MAKEFILE=ON
-      fi
       # Python modules required for test analysis
       python3 -m pip install --upgrade -r Regression/requirements.txt
       python3 -m pip cache purge
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index a10306789cb..b8ff4804e96 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -62,16 +62,6 @@ jobs:
           -DBUILD_CLI_TOOLS=OFF       \
           -DCMAKE_CXX_COMPILER_LAUNCHER=$(which ccache) \
           -DCMAKE_VERBOSE_MAKEFILE=ON
-        cmake-easyinstall --prefix=/usr/local                    \
-          git+https://github.com/icl-utk-edu/heffte.git@v2.4.0   \
-          -DCMAKE_CXX_COMPILER_LAUNCHER=$(which ccache)          \
-          -DCMAKE_CXX_STANDARD=17 -DHeffte_ENABLE_DOXYGEN=OFF    \
-          -DHeffte_ENABLE_FFTW=OFF -DHeffte_ENABLE_TESTING=OFF   \
-          -DHeffte_ENABLE_CUDA=ON -DHeffte_ENABLE_ROCM=OFF       \
-          -DHeffte_ENABLE_ONEAPI=OFF -DHeffte_ENABLE_MKL=OFF     \
-          -DHeffte_ENABLE_PYTHON=OFF -DHeffte_ENABLE_FORTRAN=OFF \
-          -DHeffte_ENABLE_MAGMA=OFF                              \
-          -DCMAKE_VERBOSE_MAKEFILE=ON
     - name: build WarpX
       run: |
         export CCACHE_COMPRESS=1
@@ -92,7 +82,6 @@ jobs:
           -DWarpX_openpmd_internal=OFF \
           -DWarpX_PRECISION=SINGLE     \
           -DWarpX_FFT=ON               \
-          -DWarpX_HEFFTE=ON            \
           -DAMReX_CUDA_ERROR_CROSS_EXECUTION_SPACE_CALL=ON \
           -DAMReX_CUDA_ERROR_CAPTURE_THIS=ON
         cmake --build build_sp -j 4
@@ -137,7 +126,7 @@ jobs:
         which nvcc || echo "nvcc not in PATH!"
 
         git clone https://github.com/AMReX-Codes/amrex.git ../amrex
-        cd ../amrex && git checkout --detach 4b703fec6c2ff983e465c8cef0cc4947231edb07 && cd -
+        cd ../amrex && git checkout --detach 294b6fee6f0c7f44693eac14e6b0c0702ecfd791 && cd -
         make COMP=gcc QED=FALSE USE_MPI=TRUE USE_GPU=TRUE USE_OMP=FALSE USE_FFT=TRUE USE_CCACHE=TRUE -j 4
 
         ccache -s
diff --git a/.github/workflows/dependencies/hip.sh b/.github/workflows/dependencies/hip.sh
index 2a1b4d090bc..1154bb05e58 100755
--- a/.github/workflows/dependencies/hip.sh
+++ b/.github/workflows/dependencies/hip.sh
@@ -79,16 +79,3 @@ sudo curl -L -o /usr/local/bin/cmake-easyinstall https://raw.githubusercontent.c
 sudo chmod a+x /usr/local/bin/cmake-easyinstall
 export CEI_SUDO="sudo"
 export CEI_TMP="/tmp/cei"
-
-# heFFTe
-#
-cmake-easyinstall --prefix=/usr/local                      \
-    git+https://github.com/icl-utk-edu/heffte.git@v2.4.0   \
-    -DCMAKE_CXX_COMPILER_LAUNCHER=$(which ccache)          \
-    -DCMAKE_CXX_STANDARD=17 -DHeffte_ENABLE_DOXYGEN=OFF    \
-    -DHeffte_ENABLE_FFTW=OFF -DHeffte_ENABLE_TESTING=OFF   \
-    -DHeffte_ENABLE_CUDA=OFF -DHeffte_ENABLE_ROCM=ON       \
-    -DHeffte_ENABLE_ONEAPI=OFF -DHeffte_ENABLE_MKL=OFF     \
-    -DHeffte_ENABLE_PYTHON=OFF -DHeffte_ENABLE_FORTRAN=OFF \
-    -DHeffte_ENABLE_MAGMA=OFF                              \
-    -DCMAKE_VERBOSE_MAKEFILE=ON
diff --git a/.github/workflows/hip.yml b/.github/workflows/hip.yml
index 8ba39de7742..6ab4e4a8401 100644
--- a/.github/workflows/hip.yml
+++ b/.github/workflows/hip.yml
@@ -61,8 +61,7 @@ jobs:
           -DWarpX_MPI=ON              \
           -DWarpX_OPENPMD=ON          \
           -DWarpX_PRECISION=SINGLE    \
-          -DWarpX_FFT=ON              \
-          -DWarpX_HEFFTE=ON
+          -DWarpX_FFT=ON
         cmake --build build_sp -j 4
 
         export WARPX_MPI=OFF
@@ -122,8 +121,7 @@ jobs:
           -DWarpX_MPI=ON              \
           -DWarpX_OPENPMD=ON          \
           -DWarpX_PRECISION=DOUBLE    \
-          -DWarpX_FFT=ON              \
-          -DWarpX_HEFFTE=ON
+          -DWarpX_FFT=ON
         cmake --build build_2d -j 4
 
         export WARPX_MPI=OFF
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ff14bacfa6..da62c943e19 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,7 +73,6 @@ option(WarpX_LIB           "Build WarpX as a library"                   OFF)
 option(WarpX_MPI           "Multi-node support (message-passing)"       ON)
 option(WarpX_OPENPMD       "openPMD I/O (HDF5, ADIOS)"                  ON)
 option(WarpX_FFT           "FFT-based solvers"                          OFF)
-option(WarpX_HEFFTE        "Multi-node FFT-based solvers"               OFF)
 option(WarpX_PYTHON        "Python bindings"                            OFF)
 option(WarpX_SENSEI        "SENSEI in situ diagnostics"                 OFF)
 option(WarpX_QED           "QED support (requires PICSAR)"              ON)
@@ -146,10 +145,6 @@ mark_as_advanced(WarpX_MPI_THREAD_MULTIPLE)
 
 option(WarpX_amrex_internal                    "Download & build AMReX" ON)
 
-if(WarpX_HEFFTE AND NOT WarpX_MPI)
-    message(FATAL_ERROR "WarpX_HEFFTE (${WarpX_HEFFTE}) can only be used if WarpX_MPI is ON.")
-endif()
-
 # change the default build type to Release (or RelWithDebInfo) instead of Debug
 set_default_build_type("Release")
 
@@ -197,10 +192,6 @@ option(ABLASTR_FFT "compile AnyFFT wrappers" ${WarpX_FFT})
 if(WarpX_FFT)
     set(ABLASTR_FFT ON CACHE STRING "FFT-based solvers" FORCE)
 endif()
-option(ABLASTR_HEFFTE "compile AnyFFT wrappers" ${WarpX_HEFFTE})
-if(WarpX_HEFFTE)
-    set(ABLASTR_HEFFTE ON CACHE STRING "Multi-Node FFT-based solvers" FORCE)
-endif()
 
 # this defined the variable BUILD_TESTING which is ON by default
 include(CTest)
@@ -242,23 +233,6 @@ if(WarpX_FFT)
     endif()
 endif()
 
-# multi-node FFT
-if(WarpX_HEFFTE)
-    if(WarpX_COMPUTE STREQUAL CUDA)
-        set(_heFFTe_COMPS CUDA)
-    elseif(WarpX_COMPUTE STREQUAL HIP)
-        set(_heFFTe_COMPS ROCM)
-    elseif(WarpX_COMPUTE STREQUAL SYCL)
-        set(_heFFTe_COMPS ONEAPI)
-    else()  # NOACC, OMP
-        set(_heFFTe_COMPS FFTW)  # or MKL
-    endif()
-    # note: we could also enforce GPUAWARE for CUDA and HIP, which can still be
-    #       disabled at runtime
-
-    find_package(Heffte REQUIRED COMPONENTS ${_heFFTe_COMPS})
-endif()
-
 # Python
 if(WarpX_PYTHON)
     find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
@@ -499,10 +473,6 @@ foreach(D IN LISTS WarpX_DIMS)
         endif()
     endif()
 
-    if(ABLASTR_HEFFTE)
-        target_link_libraries(ablastr_${SD} PUBLIC Heffte::Heffte)
-    endif()
-
     if(WarpX_PYTHON)
         target_link_libraries(pyWarpX_${SD} PRIVATE pybind11::module pybind11::windows_extras)
         if(WarpX_PYTHON_IPO)
@@ -593,13 +563,6 @@ foreach(D IN LISTS WarpX_DIMS)
         target_compile_definitions(ablastr_${SD} PUBLIC ABLASTR_USE_FFT)
     endif()
 
-    if(WarpX_HEFFTE)
-        target_compile_definitions(ablastr_${SD} PUBLIC WARPX_USE_HEFFTE)
-    endif()
-    if(ABLASTR_HEFFTE)
-        target_compile_definitions(ablastr_${SD} PUBLIC ABLASTR_USE_HEFFTE)
-    endif()
-
     if(WarpX_PYTHON AND pyWarpX_VERSION_INFO)
         # for module __version__
         target_compile_definitions(pyWarpX_${SD} PRIVATE
diff --git a/Docs/source/install/cmake.rst b/Docs/source/install/cmake.rst
index 41e4c40bc85..f3f881d4504 100644
--- a/Docs/source/install/cmake.rst
+++ b/Docs/source/install/cmake.rst
@@ -97,7 +97,6 @@ CMake Option                  Default & Values                             Descr
 ``WarpX_PRECISION``           SINGLE/**DOUBLE**                            Floating point precision (single/double)
 ``WarpX_PARTICLE_PRECISION``  SINGLE/**DOUBLE**                            Particle floating point precision (single/double), defaults to WarpX_PRECISION value if not set
 ``WarpX_FFT``                 ON/**OFF**                                   FFT-based solvers
-``WarpX_HEFFTE``              ON/**OFF**                                   Multi-Node FFT-based solvers
 ``WarpX_PYTHON``              ON/**OFF**                                   Python bindings
 ``WarpX_QED``                 **ON**/OFF                                   QED support (requires PICSAR)
 ``WarpX_QED_TABLE_GEN``       ON/**OFF**                                   QED table generation support (requires PICSAR and Boost)
@@ -275,7 +274,6 @@ Environment Variable          Default & Values                             Descr
 ``WARPX_PRECISION``           SINGLE/**DOUBLE**                            Floating point precision (single/double)
 ``WARPX_PARTICLE_PRECISION``  SINGLE/**DOUBLE**                            Particle floating point precision (single/double), defaults to WarpX_PRECISION value if not set
 ``WARPX_FFT``                 ON/**OFF**                                   FFT-based solvers
-``WARPX_HEFFTE``              ON/**OFF**                                   Multi-Node FFT-based solvers
 ``WARPX_QED``                 **ON**/OFF                                   PICSAR QED (requires PICSAR)
 ``WARPX_QED_TABLE_GEN``       ON/**OFF**                                   QED table generation (requires PICSAR and Boost)
 ``BUILD_PARALLEL``            ``2``                                        Number of threads to use for parallel builds
diff --git a/Docs/source/install/dependencies.rst b/Docs/source/install/dependencies.rst
index 71a607eae6a..13e2377d568 100644
--- a/Docs/source/install/dependencies.rst
+++ b/Docs/source/install/dependencies.rst
@@ -28,7 +28,6 @@ Optional dependencies include:
 - `FFTW3 <http://www.fftw.org>`__: for spectral solver (PSATD or IGF) support when running on CPU or SYCL
 
   - also needs the ``pkg-config`` tool on Unix
-- `heFFTe 2.4.0+ <https://github.com/icl-utk-edu/heffte>`__: for multi-node spectral solver (IGF) support
 - `BLAS++ <https://github.com/icl-utk-edu/blaspp>`__ and `LAPACK++ <https://github.com/icl-utk-edu/lapackpp>`__: for spectral solver (PSATD) support in RZ geometry
 - `Boost 1.66.0+ <https://www.boost.org/>`__: for QED lookup tables generation support
 - `openPMD-api 0.15.1+ <https://github.com/openPMD/openPMD-api>`__: we automatically download and compile a copy of openPMD-api for openPMD I/O support
@@ -81,7 +80,7 @@ Conda (Linux/macOS/Windows)
 
       .. code-block:: bash
 
-         conda create -n warpx-cpu-mpich-dev -c conda-forge blaspp boost ccache cmake compilers git "heffte=*=mpi_mpich*" lapackpp "openpmd-api=*=mpi_mpich*" openpmd-viewer python make numpy pandas scipy yt "fftw=*=mpi_mpich*" pkg-config matplotlib mamba mpich mpi4py ninja pip virtualenv
+         conda create -n warpx-cpu-mpich-dev -c conda-forge blaspp boost ccache cmake compilers git lapackpp "openpmd-api=*=mpi_mpich*" openpmd-viewer python make numpy pandas scipy yt "fftw=*=mpi_mpich*" pkg-config matplotlib mamba mpich mpi4py ninja pip virtualenv
          conda activate warpx-cpu-mpich-dev
 
          # compile WarpX with -DWarpX_MPI=ON
diff --git a/Examples/Tests/open_bc_poisson_solver/CMakeLists.txt b/Examples/Tests/open_bc_poisson_solver/CMakeLists.txt
index d6141f0b4ab..c5ec4583da1 100644
--- a/Examples/Tests/open_bc_poisson_solver/CMakeLists.txt
+++ b/Examples/Tests/open_bc_poisson_solver/CMakeLists.txt
@@ -12,15 +12,3 @@ if(WarpX_FFT)
         OFF  # dependency
     )
 endif()
-
-if(WarpX_HEFFTE)
-    add_warpx_test(
-        test_3d_open_bc_poisson_solver_heffte # name
-        3  # dims
-        2  # nprocs
-        inputs_test_3d_open_bc_poisson_solver_heffte  # inputs
-        analysis.py  # analysis
-        diags/diag1000001  # output
-        OFF  # dependency
-    )
-endif()
diff --git a/Examples/Tests/open_bc_poisson_solver/inputs_test_3d_open_bc_poisson_solver_heffte b/Examples/Tests/open_bc_poisson_solver/inputs_test_3d_open_bc_poisson_solver_heffte
deleted file mode 100644
index 4f0a50df037..00000000000
--- a/Examples/Tests/open_bc_poisson_solver/inputs_test_3d_open_bc_poisson_solver_heffte
+++ /dev/null
@@ -1 +0,0 @@
-FILE = inputs_test_3d_open_bc_poisson_solver
diff --git a/GNUmakefile b/GNUmakefile
index 1cc78403c7b..6298dd83369 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -38,7 +38,6 @@ USE_OPENPMD = FALSE
 WarpxBinDir = Bin
 
 USE_FFT = FALSE
-USE_HEFFTE = FALSE
 USE_RZ = FALSE
 
 USE_EB = FALSE
diff --git a/Source/ablastr/fields/IntegratedGreenFunctionSolver.cpp b/Source/ablastr/fields/IntegratedGreenFunctionSolver.cpp
index 546326d7fe0..a3ff69d273d 100644
--- a/Source/ablastr/fields/IntegratedGreenFunctionSolver.cpp
+++ b/Source/ablastr/fields/IntegratedGreenFunctionSolver.cpp
@@ -8,7 +8,6 @@
 
 #include <ablastr/constant.H>
 #include <ablastr/warn_manager/WarnManager.H>
-#include <ablastr/math/fft/AnyFFT.H>
 
 #include <AMReX_Array4.H>
 #include <AMReX_BaseFab.H>
@@ -18,6 +17,7 @@
 #include <AMReX_Config.H>
 #include <AMReX_DistributionMapping.H>
 #include <AMReX_FabArray.H>
+#include <AMReX_FFT.H>
 #include <AMReX_GpuControl.H>
 #include <AMReX_GpuLaunch.H>
 #include <AMReX_GpuQualifiers.H>
@@ -25,13 +25,9 @@
 #include <AMReX_MFIter.H>
 #include <AMReX_MLLinOp.H>
 #include <AMReX_MultiFab.H>
+#include <AMReX_ParmParse.H>
 #include <AMReX_REAL.H>
 
-#if defined(ABLASTR_USE_FFT) && defined(ABLASTR_USE_HEFFTE)
-#include <heffte.h>
-#endif
-
-
 namespace ablastr::fields {
 
 void
@@ -42,10 +38,6 @@ computePhiIGF ( amrex::MultiFab const & rho,
 {
     using namespace amrex::literals;
 
-    BL_PROFILE_VAR_NS("ablastr::fields::computePhiIGF: FFTs", timer_ffts);
-    BL_PROFILE_VAR_NS("ablastr::fields::computePhiIGF: FFT plans", timer_plans);
-    BL_PROFILE_VAR_NS("ablastr::fields::computePhiIGF: parallel copies", timer_pcopies);
-
     BL_PROFILE("ablastr::fields::computePhiIGF");
 
     // Define box that encompasses the full domain
@@ -53,240 +45,47 @@ computePhiIGF ( amrex::MultiFab const & rho,
     domain.surroundingNodes(); // get nodal points, since `phi` and `rho` are nodal
     domain.grow( phi.nGrowVect() ); // include guard cells
 
-    int const nx = domain.length(0);
-    int const ny = domain.length(1);
-    int const nz = domain.length(2);
-
-    // Allocate 2x wider arrays for the convolution of rho with the Green function
-    amrex::Box const realspace_box = amrex::Box(
-        {domain.smallEnd(0), domain.smallEnd(1), domain.smallEnd(2)},
-        {2*nx-1+domain.smallEnd(0), 2*ny-1+domain.smallEnd(1), 2*nz-1+domain.smallEnd(2)},
-        amrex::IntVect::TheNodeVector() );
+    // Do we grow the domain in the z-direction in the 2D mode?
+    bool do_2d_fft = false;
 
-#if !defined(ABLASTR_USE_HEFFTE)
-    // Without distributed FFTs (i.e. without heFFTe):
-    // allocate the 2x wider array on a single box
-    amrex::BoxArray const realspace_ba = amrex::BoxArray( realspace_box );
-    // Define a distribution mapping for the global FFT, with only one box
-    amrex::DistributionMapping dm_global_fft;
-    dm_global_fft.define( realspace_ba );
-#elif defined(ABLASTR_USE_HEFFTE)
-    // With distributed FFTs (i.e. with heFFTe):
-    // Define a new distribution mapping which is decomposed purely along z
-    // and has one box per MPI rank
-    int const nprocs = amrex::ParallelDescriptor::NProcs();
-    amrex::BoxArray realspace_ba;
-    amrex::DistributionMapping dm_global_fft;
+    // Specify the number of processes for FFT. Can be any positive number,
+    // including 1.
+    int nprocs = amrex::ParallelDescriptor::NProcs();
     {
-        int realspace_nx = realspace_box.length(0);
-        int realspace_ny = realspace_box.length(1);
-        int realspace_nz = realspace_box.length(2);
-        int minsize_z = realspace_nz / nprocs;
-        int nleft_z = realspace_nz - minsize_z*nprocs;
-
-        AMREX_ALWAYS_ASSERT(realspace_nz >= nprocs);
-        // We are going to split realspace_box in such a way that the first
-        // nleft boxes has minsize_z+1 nodes and the others minsize
-        // nodes. We do it this way instead of BoxArray::maxSize to make
-        // sure there are exactly nprocs boxes and there are no overlaps.
-        amrex::BoxList bl(amrex::IndexType::TheNodeType());
-        for (int iproc = 0; iproc < nprocs; ++iproc) {
-            int zlo, zhi;
-            if (iproc < nleft_z) {
-                zlo = iproc*(minsize_z+1);
-                zhi = zlo + minsize_z;
-
-            } else {
-                zlo = iproc*minsize_z + nleft_z;
-                zhi = zlo + minsize_z - 1;
-
-            }
-            amrex::Box tbx(amrex::IntVect(0,0,zlo),amrex::IntVect(realspace_nx-1,realspace_ny-1,zhi),amrex::IntVect(1));
-
-            tbx.shift(realspace_box.smallEnd());
-            bl.push_back(tbx);
-        }
-        realspace_ba.define(std::move(bl));
-        amrex::Vector<int> pmap(nprocs);
-        std::iota(pmap.begin(), pmap.end(), 0);
-        dm_global_fft.define(std::move(pmap));
+        amrex::ParmParse pp("ablastr");
+        bool do_serial_fft = false;
+        pp.query("do_serial_fft", do_serial_fft);
+        if (do_serial_fft) { nprocs = 1; };
     }
-#endif
-
-    // Allocate required arrays
-    amrex::MultiFab tmp_rho = amrex::MultiFab(realspace_ba, dm_global_fft, 1, 0);
-    tmp_rho.setVal(0);
-    amrex::MultiFab tmp_G = amrex::MultiFab(realspace_ba, dm_global_fft, 1, 0);
-    tmp_G.setVal(0);
-
-    BL_PROFILE_VAR_START(timer_pcopies);
-    // Copy from rho to tmp_rho
-    tmp_rho.ParallelCopy( rho, 0, 0, 1, amrex::IntVect::TheZeroVector(), amrex::IntVect::TheZeroVector() );
-    BL_PROFILE_VAR_STOP(timer_pcopies);
-
-#if !defined(ABLASTR_USE_HEFFTE)
-    // Without distributed FFTs (i.e. without heFFTe):
-    // We loop over the original box (not the 2x wider one), and the other quadrants by periodicity
-    amrex::BoxArray const& igf_compute_box = amrex::BoxArray( domain );
-#else
-    // With distributed FFTs (i.e. with heFFTe):
-    // We loop over the full 2x wider box, since 1 MPI rank does not necessarily own the data for the other quadrants
-    amrex::BoxArray const& igf_compute_box = tmp_G.boxArray();
-#endif
-
-    // Compute the integrated Green function
-#ifdef AMREX_USE_OMP
-#pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
-#endif
-    for (amrex::MFIter mfi(igf_compute_box, dm_global_fft, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
-
-        amrex::Box const bx = mfi.tilebox();
 
-        amrex::IntVect const lo = realspace_box.smallEnd();
-        amrex::IntVect const hi = realspace_box.bigEnd();
-
-        // Fill values of the Green function
-        amrex::Real const dx = cell_size[0];
-        amrex::Real const dy = cell_size[1];
-        amrex::Real const dz = cell_size[2];
-
-        amrex::Array4<amrex::Real> const tmp_G_arr = tmp_G.array(mfi);
-        amrex::ParallelFor( bx,
-            [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
-            {
-                int const i0 = i - lo[0];
-                int const j0 = j - lo[1];
-                int const k0 = k - lo[2];
-                amrex::Real const x = i0*dx;
-                amrex::Real const y = j0*dy;
-                amrex::Real const z = k0*dz;
-
-#if !defined(ABLASTR_USE_HEFFTE)
-                // Without distributed FFTs (i.e. without heFFTe):
-                amrex::Real const G_value = SumOfIntegratedPotential(x     , y     , z     , dx, dy, dz);
-                tmp_G_arr(i,j,k) = G_value;
-                // Fill the rest of the array by periodicity
-                if (i0>0) {tmp_G_arr(hi[0]+1-i0, j         , k         ) = G_value;}
-                if (j0>0) {tmp_G_arr(i         , hi[1]+1-j0, k         ) = G_value;}
-                if (k0>0) {tmp_G_arr(i         , j         , hi[2]+1-k0) = G_value;}
-                if ((i0>0)&&(j0>0)) {tmp_G_arr(hi[0]+1-i0, hi[1]+1-j0, k         ) = G_value;}
-                if ((j0>0)&&(k0>0)) {tmp_G_arr(i         , hi[1]+1-j0, hi[2]+1-k0) = G_value;}
-                if ((i0>0)&&(k0>0)) {tmp_G_arr(hi[0]+1-i0, j         , hi[2]+1-k0) = G_value;}
-                if ((i0>0)&&(j0>0)&&(k0>0)) {tmp_G_arr(hi[0]+1-i0, hi[1]+1-j0, hi[2]+1-k0) = G_value;}
-#else
-                // With distributed FFTs (i.e. with heFFTe):
-                amrex::Real x_hi = dx*(hi[0]+2);
-                amrex::Real y_hi = dy*(hi[1]+2);
-                amrex::Real z_hi = dz*(hi[2]+2);
-                if ((i0< nx)&&(j0< ny)&&(k0< nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x     , y     , z     , dx, dy, dz); }
-                if ((i0< nx)&&(j0> ny)&&(k0< nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x     , y_hi-y, z     , dx, dy, dz); }
-                if ((i0< nx)&&(j0< ny)&&(k0> nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x     , y     , z_hi-z, dx, dy, dz); }
-                if ((i0> nx)&&(j0> ny)&&(k0< nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x_hi-x, y_hi-y, z     , dx, dy, dz); }
-                if ((i0< nx)&&(j0> ny)&&(k0> nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x     , y_hi-y, z_hi-z, dx, dy, dz); }
-                if ((i0> nx)&&(j0< ny)&&(k0> nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x_hi-x, y     , z_hi-z, dx, dy, dz); }
-                if ((i0> nx)&&(j0> ny)&&(k0> nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x_hi-x, y_hi-y, z_hi-z, dx, dy, dz); }
-                if ((i0> nx)&&(j0< ny)&&(k0< nz)) { tmp_G_arr(i,j,k) = SumOfIntegratedPotential(x_hi-x, y     , z     , dx, dy, dz); }
-#endif
-         }
-      );
+    static std::unique_ptr<amrex::FFT::OpenBCSolver<amrex::Real>> obc_solver;
+    if (!obc_solver) {
+        amrex::ExecOnFinalize([&] () { obc_solver.reset(); });
     }
-
-    // Prepare to perform global FFT
-    // Since there is 1 MPI rank per box, here each MPI rank obtains its local box and the associated boxid
-    const int local_boxid = amrex::ParallelDescriptor::MyProc(); // because of how we made the DistributionMapping
-    if (local_boxid < realspace_ba.size()) {
-        // When not using heFFTe, there is only one box (the global box)
-        // It is taken care of my MPI rank 0 ; other ranks have no work (hence the if condition)
-
-        const amrex::Box local_nodal_box = realspace_ba[local_boxid];
-        amrex::Box local_box(local_nodal_box.smallEnd(), local_nodal_box.bigEnd());
-        local_box.shift(-realspace_box.smallEnd()); // This simplifies the setup because the global lo is zero now
-        // Since we the domain decompostion is in the z-direction, setting up c_local_box is simple.
-        amrex::Box c_local_box = local_box;
-        c_local_box.setBig(0, local_box.length(0)/2+1);
-
-        // Allocate array in spectral space
-        using SpectralField = amrex::BaseFab< amrex::GpuComplex< amrex::Real > > ;
-        SpectralField tmp_rho_fft(c_local_box, 1, amrex::The_Device_Arena());
-        SpectralField tmp_G_fft(c_local_box, 1, amrex::The_Device_Arena());
-        tmp_rho_fft.shift(realspace_box.smallEnd());
-        tmp_G_fft.shift(realspace_box.smallEnd());
-
-        // Create FFT plans
-        BL_PROFILE_VAR_START(timer_plans);
-#if !defined(ABLASTR_USE_HEFFTE)
-        const amrex::IntVect fft_size = realspace_ba[local_boxid].length();
-        ablastr::math::anyfft::FFTplan forward_plan_rho = ablastr::math::anyfft::CreatePlan(
-            fft_size, tmp_rho[local_boxid].dataPtr(),
-            reinterpret_cast<ablastr::math::anyfft::Complex*>(tmp_rho_fft.dataPtr()),
-            ablastr::math::anyfft::direction::R2C, AMREX_SPACEDIM);
-        ablastr::math::anyfft::FFTplan forward_plan_G = ablastr::math::anyfft::CreatePlan(
-            fft_size, tmp_G[local_boxid].dataPtr(),
-            reinterpret_cast<ablastr::math::anyfft::Complex*>(tmp_G_fft.dataPtr()),
-            ablastr::math::anyfft::direction::R2C, AMREX_SPACEDIM);
-        ablastr::math::anyfft::FFTplan backward_plan = ablastr::math::anyfft::CreatePlan(
-            fft_size, tmp_G[local_boxid].dataPtr(),
-            reinterpret_cast<ablastr::math::anyfft::Complex*>( tmp_G_fft.dataPtr()),
-            ablastr::math::anyfft::direction::C2R, AMREX_SPACEDIM);
-#elif defined(ABLASTR_USE_HEFFTE)
-#if     defined(AMREX_USE_CUDA)
-        heffte::fft3d_r2c<heffte::backend::cufft> fft
-#elif   defined(AMREX_USE_HIP)
-        heffte::fft3d_r2c<heffte::backend::rocfft> fft
-#else
-        heffte::fft3d_r2c<heffte::backend::fftw> fft
-#endif
-            ({{local_box.smallEnd(0), local_box.smallEnd(1), local_box.smallEnd(2)},
-            {local_box.bigEnd(0), local_box.bigEnd(1), local_box.bigEnd(2)}},
-            {{c_local_box.smallEnd(0), c_local_box.smallEnd(1), c_local_box.smallEnd(2)},
-            {c_local_box.bigEnd(0), c_local_box.bigEnd(1), c_local_box.bigEnd(2)}},
-            0, amrex::ParallelDescriptor::Communicator());
-        using heffte_complex = typename heffte::fft_output<amrex::Real>::type;
-        heffte_complex* rho_fft_data = (heffte_complex*) tmp_rho_fft.dataPtr();
-        heffte_complex* G_fft_data = (heffte_complex*) tmp_G_fft.dataPtr();
-#endif
-        BL_PROFILE_VAR_STOP(timer_plans);
-
-        // Perform forward FFTs
-        BL_PROFILE_VAR_START(timer_ffts);
-#if !defined(ABLASTR_USE_HEFFTE)
-        ablastr::math::anyfft::Execute(forward_plan_rho);
-        ablastr::math::anyfft::Execute(forward_plan_G);
-#elif defined(ABLASTR_USE_HEFFTE)
-        fft.forward(tmp_rho[local_boxid].dataPtr(), rho_fft_data);
-        fft.forward(tmp_G[local_boxid].dataPtr(), G_fft_data);
-#endif
-        BL_PROFILE_VAR_STOP(timer_ffts);
-
-        // Multiply tmp_G_fft and tmp_rho_fft in spectral space
-        // Store the result in-place in Gtmp_G_fft, to save memory
-        tmp_G_fft.template mult<amrex::RunOn::Device>(tmp_rho_fft, 0, 0, 1);
-        amrex::Gpu::streamSynchronize();
-
-        // Perform backward FFT
-        BL_PROFILE_VAR_START(timer_ffts);
-#if !defined(ABLASTR_USE_HEFFTE)
-        ablastr::math::anyfft::Execute(backward_plan);
-#elif defined(ABLASTR_USE_HEFFTE)
-        fft.backward(G_fft_data, tmp_G[local_boxid].dataPtr());
-#endif
-        BL_PROFILE_VAR_STOP(timer_ffts);
-
-#if !defined(ABLASTR_USE_HEFFTE)
-        // Loop to destroy FFT plans
-        ablastr::math::anyfft::DestroyPlan(forward_plan_G);
-        ablastr::math::anyfft::DestroyPlan(forward_plan_rho);
-        ablastr::math::anyfft::DestroyPlan(backward_plan);
-#endif
+    if (!obc_solver || obc_solver->Domain() != domain) {
+        amrex::FFT::Info info{};
+        if (do_2d_fft) { info.setBatchMode(true); }
+        info.setNumProcs(nprocs);
+        obc_solver = std::make_unique<amrex::FFT::OpenBCSolver<amrex::Real>>(domain, info);
     }
 
-     // Normalize, since (FFT + inverse FFT) results in a factor N
-    const amrex::Real normalization = 1._rt / realspace_box.numPts();
-    tmp_G.mult( normalization );
-
-    BL_PROFILE_VAR_START(timer_pcopies);
-    // Copy from tmp_G to phi
-    phi.ParallelCopy( tmp_G, 0, 0, 1, amrex::IntVect::TheZeroVector(), phi.nGrowVect());
-    BL_PROFILE_VAR_STOP(timer_pcopies);
+    auto const& lo = domain.smallEnd();
+    amrex::Real const dx = cell_size[0];
+    amrex::Real const dy = cell_size[1];
+    amrex::Real const dz = cell_size[2];
+
+    obc_solver->setGreensFunction(
+        [=] AMREX_GPU_DEVICE (int i, int j, int k) -> amrex::Real
+        {
+            int const i0 = i - lo[0];
+            int const j0 = j - lo[1];
+            int const k0 = k - lo[2];
+            amrex::Real const x = i0*dx;
+            amrex::Real const y = j0*dy;
+            amrex::Real const z = k0*dz;
+            return SumOfIntegratedPotential(x, y, z, dx, dy, dz);
+        });
+
+    obc_solver->solve(phi, rho);
 }
 } // namespace ablastr::fields
diff --git a/Tools/machines/desktop/spack-macos-openmp.yaml b/Tools/machines/desktop/spack-macos-openmp.yaml
index 3ea78625b78..820cf7069fd 100644
--- a/Tools/machines/desktop/spack-macos-openmp.yaml
+++ b/Tools/machines/desktop/spack-macos-openmp.yaml
@@ -23,7 +23,6 @@ spack:
   - conduit ~fortran
   - fftw
   - hdf5 ~fortran
-  - heffte ~cuda +fftw
   - lapackpp ~cuda ~rocm ^blaspp ~cuda +openmp ~rocm
   - mpi
   - llvm-openmp
diff --git a/Tools/machines/desktop/spack-ubuntu-cuda.yaml b/Tools/machines/desktop/spack-ubuntu-cuda.yaml
index 19b9ae12e24..08d0c95ee4b 100644
--- a/Tools/machines/desktop/spack-ubuntu-cuda.yaml
+++ b/Tools/machines/desktop/spack-ubuntu-cuda.yaml
@@ -25,7 +25,6 @@ spack:
   - cuda
   - fftw
   - hdf5
-  - heffte
   - lapackpp
   - mpi
   - pkgconfig
diff --git a/Tools/machines/desktop/spack-ubuntu-openmp.yaml b/Tools/machines/desktop/spack-ubuntu-openmp.yaml
index 1eb7d4074a7..b658f1e009d 100644
--- a/Tools/machines/desktop/spack-ubuntu-openmp.yaml
+++ b/Tools/machines/desktop/spack-ubuntu-openmp.yaml
@@ -22,7 +22,6 @@ spack:
   - ecp-data-vis-sdk +adios2 +ascent +hdf5 +sensei
   - fftw
   - hdf5
-  - heffte ~cuda +fftw
   - lapackpp ~cuda ~rocm ^blaspp ~cuda +openmp ~rocm
   - mpi
   - pkgconfig
diff --git a/Tools/machines/desktop/spack-ubuntu-rocm.yaml b/Tools/machines/desktop/spack-ubuntu-rocm.yaml
index 7eee1baa13c..45c9b0f776e 100644
--- a/Tools/machines/desktop/spack-ubuntu-rocm.yaml
+++ b/Tools/machines/desktop/spack-ubuntu-rocm.yaml
@@ -21,7 +21,6 @@ spack:
   - cmake
   - ecp-data-vis-sdk +adios2 +ascent +hdf5 +sensei
   - hdf5
-  - heffte
   - hip
   - lapackpp
   - llvm-amdgpu
diff --git a/Tools/machines/lonestar6-tacc/install_a100_dependencies.sh b/Tools/machines/lonestar6-tacc/install_a100_dependencies.sh
index cd29664a978..fd3a2d3f756 100755
--- a/Tools/machines/lonestar6-tacc/install_a100_dependencies.sh
+++ b/Tools/machines/lonestar6-tacc/install_a100_dependencies.sh
@@ -96,45 +96,6 @@ CXXFLAGS="-DLAPACK_FORTRAN_ADD_" cmake -S $HOME/src/lapackpp -B ${build_dir}/lap
 cmake --build ${build_dir}/lapackpp-a100-build --target install --parallel 16
 rm -rf ${build_dir}/lapackpp-a100-build
 
-# heFFTe
-if [ -d $HOME/src/heffte ]
-then
-  cd $HOME/src/heffte
-  git fetch --prune
-  git checkout v2.4.0
-  cd -
-else
-  git clone -b v2.4.0 https://github.com/icl-utk-edu/heffte.git ${HOME}/src/heffte
-fi
-rm -rf ${HOME}/src/heffte-a100-build
-cmake \
-    -S ${HOME}/src/heffte               \
-    -B ${build_dir}/heffte-a100-build \
-    -DBUILD_SHARED_LIBS=ON              \
-    -DCMAKE_BUILD_TYPE=Release          \
-    -DCMAKE_CXX_STANDARD=17             \
-    -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON  \
-    -DCMAKE_INSTALL_PREFIX=${SW_DIR}/heffte-2.4.0  \
-    -DHeffte_DISABLE_GPU_AWARE_MPI=OFF  \
-    -DHeffte_ENABLE_AVX=OFF             \
-    -DHeffte_ENABLE_AVX512=OFF          \
-    -DHeffte_ENABLE_FFTW=OFF            \
-    -DHeffte_ENABLE_CUDA=ON             \
-    -DHeffte_ENABLE_ROCM=OFF            \
-    -DHeffte_ENABLE_ONEAPI=OFF          \
-    -DHeffte_ENABLE_MKL=OFF             \
-    -DHeffte_ENABLE_DOXYGEN=OFF         \
-    -DHeffte_SEQUENTIAL_TESTING=OFF     \
-    -DHeffte_ENABLE_TESTING=OFF         \
-    -DHeffte_ENABLE_TRACING=OFF         \
-    -DHeffte_ENABLE_PYTHON=OFF          \
-    -DHeffte_ENABLE_FORTRAN=OFF         \
-    -DHeffte_ENABLE_SWIG=OFF            \
-    -DHeffte_ENABLE_MAGMA=OFF
-cmake --build ${build_dir}/heffte-a100-build --target install --parallel 16
-rm -rf ${build_dir}/heffte-a100-build
-
-
 # Python ######################################################################
 #
 python3 -m pip install --upgrade pip
diff --git a/Tools/machines/lonestar6-tacc/lonestar6_warpx_a100.profile.example b/Tools/machines/lonestar6-tacc/lonestar6_warpx_a100.profile.example
index 148299f281c..57c98da9b4a 100644
--- a/Tools/machines/lonestar6-tacc/lonestar6_warpx_a100.profile.example
+++ b/Tools/machines/lonestar6-tacc/lonestar6_warpx_a100.profile.example
@@ -20,13 +20,11 @@ export CMAKE_PREFIX_PATH=${SW_DIR}/c-blosc-1.21.1:${CMAKE_PREFIX_PATH}
 export CMAKE_PREFIX_PATH=${SW_DIR}/adios2-2.8.3:${CMAKE_PREFIX_PATH}
 export CMAKE_PREFIX_PATH=${SW_DIR}/blaspp-2024.05.31:${CMAKE_PREFIX_PATH}
 export CMAKE_PREFIX_PATH=${SW_DIR}/lapackpp-2024.05.31:${CMAKE_PREFIX_PATH}
-export CMAKE_PREFIX_PATH=${SW_DIR}/heffte-2.4.0:${CMAKE_PREFIX_PATH}
 
 export LD_LIBRARY_PATH=${SW_DIR}/c-blosc-1.21.1/lib64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=${SW_DIR}/adios2-2.8.3/lib64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=${SW_DIR}/blaspp-2024.05.31/lib64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=${SW_DIR}/lapackpp-2024.05.31/lib64:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=${SW_DIR}/heffte-2.4.0/lib64:$LD_LIBRARY_PATH
 
 export PATH=${SW_DIR}/adios2-2.8.3/bin:${PATH}
 
diff --git a/Tools/machines/perlmutter-nersc/install_cpu_dependencies.sh b/Tools/machines/perlmutter-nersc/install_cpu_dependencies.sh
index 437300b8303..7608cb3f666 100755
--- a/Tools/machines/perlmutter-nersc/install_cpu_dependencies.sh
+++ b/Tools/machines/perlmutter-nersc/install_cpu_dependencies.sh
@@ -107,45 +107,6 @@ CXX=$(which CC) CXXFLAGS="-DLAPACK_FORTRAN_ADD_" cmake -S $HOME/src/lapackpp -B
 cmake --build ${build_dir}/lapackpp-pm-cpu-build --target install --parallel 16
 rm -rf ${build_dir}/lapackpp-pm-cpu-build
 
-# heFFTe
-if [ -d $HOME/src/heffte ]
-then
-  cd $HOME/src/heffte
-  git fetch --prune
-  git checkout v2.4.0
-  cd -
-else
-  git clone -b v2.4.0 https://github.com/icl-utk-edu/heffte.git ${HOME}/src/heffte
-fi
-rm -rf ${HOME}/src/heffte-pm-cpu-build
-cmake \
-    -S ${HOME}/src/heffte               \
-    -B ${build_dir}/heffte-pm-cpu-build \
-    -DBUILD_SHARED_LIBS=ON              \
-    -DCMAKE_BUILD_TYPE=Release          \
-    -DCMAKE_CXX_STANDARD=17             \
-    -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON  \
-    -DCMAKE_INSTALL_PREFIX=${SW_DIR}/heffte-2.4.0  \
-    -DHeffte_DISABLE_GPU_AWARE_MPI=ON   \
-    -DHeffte_ENABLE_AVX=ON              \
-    -DHeffte_ENABLE_AVX512=OFF          \
-    -DHeffte_ENABLE_FFTW=ON             \
-    -DHeffte_ENABLE_CUDA=OFF            \
-    -DHeffte_ENABLE_ROCM=OFF            \
-    -DHeffte_ENABLE_ONEAPI=OFF          \
-    -DHeffte_ENABLE_MKL=OFF             \
-    -DHeffte_ENABLE_DOXYGEN=OFF         \
-    -DHeffte_SEQUENTIAL_TESTING=OFF     \
-    -DHeffte_ENABLE_TESTING=OFF         \
-    -DHeffte_ENABLE_TRACING=OFF         \
-    -DHeffte_ENABLE_PYTHON=OFF          \
-    -DHeffte_ENABLE_FORTRAN=OFF         \
-    -DHeffte_ENABLE_SWIG=OFF            \
-    -DHeffte_ENABLE_MAGMA=OFF
-cmake --build ${build_dir}/heffte-pm-cpu-build --target install --parallel 16
-rm -rf ${build_dir}/heffte-pm-cpu-build
-
-
 # Python ######################################################################
 #
 python3 -m pip install --upgrade pip
diff --git a/Tools/machines/perlmutter-nersc/install_gpu_dependencies.sh b/Tools/machines/perlmutter-nersc/install_gpu_dependencies.sh
index c77f075a3a8..d08ca7457d4 100755
--- a/Tools/machines/perlmutter-nersc/install_gpu_dependencies.sh
+++ b/Tools/machines/perlmutter-nersc/install_gpu_dependencies.sh
@@ -107,49 +107,6 @@ CXX=$(which CC) CXXFLAGS="-DLAPACK_FORTRAN_ADD_" cmake -S $HOME/src/lapackpp -B
 cmake --build ${build_dir}/lapackpp-pm-gpu-build --target install --parallel 16
 rm -rf ${build_dir}/lapackpp-pm-gpu-build
 
-# heFFTe
-if [ -d $HOME/src/heffte ]
-then
-  cd $HOME/src/heffte
-  git fetch --prune
-  git checkout v2.4.0
-  cd -
-else
-  git clone -b v2.4.0 https://github.com/icl-utk-edu/heffte.git ${HOME}/src/heffte
-fi
-rm -rf ${HOME}/src/heffte-pm-gpu-build
-cmake \
-    -S ${HOME}/src/heffte               \
-    -B ${build_dir}/heffte-pm-gpu-build \
-    -DBUILD_SHARED_LIBS=ON              \
-    -DCMAKE_BUILD_TYPE=Release          \
-    -DCMAKE_CXX_STANDARD=17             \
-    -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON  \
-    -DCMAKE_INSTALL_PREFIX=${SW_DIR}/heffte-2.4.0  \
-    -DHeffte_DISABLE_GPU_AWARE_MPI=OFF  \
-    -DHeffte_ENABLE_AVX=OFF             \
-    -DHeffte_ENABLE_AVX512=OFF          \
-    -DHeffte_ENABLE_FFTW=OFF            \
-    -DHeffte_ENABLE_CUDA=ON             \
-    -DHeffte_ENABLE_ROCM=OFF            \
-    -DHeffte_ENABLE_ONEAPI=OFF          \
-    -DHeffte_ENABLE_MKL=OFF             \
-    -DHeffte_ENABLE_DOXYGEN=OFF         \
-    -DHeffte_SEQUENTIAL_TESTING=OFF     \
-    -DHeffte_ENABLE_TESTING=OFF         \
-    -DHeffte_ENABLE_TRACING=OFF         \
-    -DHeffte_ENABLE_PYTHON=OFF          \
-    -DHeffte_ENABLE_FORTRAN=OFF         \
-    -DHeffte_ENABLE_SWIG=OFF            \
-    -DHeffte_ENABLE_MAGMA=OFF
-cmake --build ${build_dir}/heffte-pm-gpu-build --target install --parallel 16
-rm -rf ${build_dir}/heffte-pm-gpu-build
-
-# work-around for heFFTe 2.4.0 bug with NVCC
-# https://github.com/icl-utk-edu/heffte/pull/54
-sed -i 's/__AVX__/NOTDEFINED_DONOTUSE/g' ${SW_DIR}/heffte-2.4.0/include/stock_fft/heffte_stock_vec_types.h
-
-
 # Python ######################################################################
 #
 python3 -m pip install --upgrade pip
diff --git a/Tools/machines/perlmutter-nersc/perlmutter_cpu_warpx.profile.example b/Tools/machines/perlmutter-nersc/perlmutter_cpu_warpx.profile.example
index 94d598abf5b..99817924ad6 100644
--- a/Tools/machines/perlmutter-nersc/perlmutter_cpu_warpx.profile.example
+++ b/Tools/machines/perlmutter-nersc/perlmutter_cpu_warpx.profile.example
@@ -19,13 +19,11 @@ export CMAKE_PREFIX_PATH=${CFS}/${proj}/${USER}/sw/perlmutter/cpu/c-blosc-1.21.1
 export CMAKE_PREFIX_PATH=${CFS}/${proj}/${USER}/sw/perlmutter/cpu/adios2-2.8.3:$CMAKE_PREFIX_PATH
 export CMAKE_PREFIX_PATH=${CFS}/${proj}/${USER}/sw/perlmutter/cpu/blaspp-2024.05.31:$CMAKE_PREFIX_PATH
 export CMAKE_PREFIX_PATH=${CFS}/${proj}/${USER}/sw/perlmutter/cpu/lapackpp-2024.05.31:$CMAKE_PREFIX_PATH
-export CMAKE_PREFIX_PATH=${CFS}/${proj}/${USER}/sw/perlmutter/cpu/heffte-2.4.0:$CMAKE_PREFIX_PATH
 
 export LD_LIBRARY_PATH=${CFS}/${proj}/${USER}/sw/perlmutter/cpu/c-blosc-1.21.1/lib64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=${CFS}/${proj}/${USER}/sw/perlmutter/cpu/adios2-2.8.3/lib64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=${CFS}/${proj}/${USER}/sw/perlmutter/cpu/blaspp-2024.05.31/lib64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=${CFS}/${proj}/${USER}/sw/perlmutter/cpu/lapackpp-2024.05.31/lib64:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=${CFS}/${proj}/${USER}/sw/perlmutter/cpu/heffte-2.4.0/lib64:$LD_LIBRARY_PATH
 
 export PATH=${CFS}/${proj}/${USER}/sw/perlmutter/cpu/adios2-2.8.3/bin:${PATH}
 
diff --git a/Tools/machines/perlmutter-nersc/perlmutter_gpu_warpx.profile.example b/Tools/machines/perlmutter-nersc/perlmutter_gpu_warpx.profile.example
index da1d55964d1..1e5325e29b9 100644
--- a/Tools/machines/perlmutter-nersc/perlmutter_gpu_warpx.profile.example
+++ b/Tools/machines/perlmutter-nersc/perlmutter_gpu_warpx.profile.example
@@ -23,13 +23,11 @@ export CMAKE_PREFIX_PATH=${CFS}/${proj%_g}/${USER}/sw/perlmutter/gpu/c-blosc-1.2
 export CMAKE_PREFIX_PATH=${CFS}/${proj%_g}/${USER}/sw/perlmutter/gpu/adios2-2.8.3:$CMAKE_PREFIX_PATH
 export CMAKE_PREFIX_PATH=${CFS}/${proj%_g}/${USER}/sw/perlmutter/gpu/blaspp-2024.05.31:$CMAKE_PREFIX_PATH
 export CMAKE_PREFIX_PATH=${CFS}/${proj%_g}/${USER}/sw/perlmutter/gpu/lapackpp-2024.05.31:$CMAKE_PREFIX_PATH
-export CMAKE_PREFIX_PATH=${CFS}/${proj%_g}/${USER}/sw/perlmutter/gpu/heffte-2.4.0:$CMAKE_PREFIX_PATH
 
 export LD_LIBRARY_PATH=${CFS}/${proj%_g}/${USER}/sw/perlmutter/gpu/c-blosc-1.21.1/lib64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=${CFS}/${proj%_g}/${USER}/sw/perlmutter/gpu/adios2-2.8.3/lib64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=${CFS}/${proj%_g}/${USER}/sw/perlmutter/gpu/blaspp-2024.05.31/lib64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=${CFS}/${proj%_g}/${USER}/sw/perlmutter/gpu/lapackpp-2024.05.31/lib64:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=${CFS}/${proj%_g}/${USER}/sw/perlmutter/gpu/heffte-2.4.0/lib64:$LD_LIBRARY_PATH
 
 export PATH=${CFS}/${proj%_g}/${USER}/sw/perlmutter/gpu/adios2-2.8.3/bin:${PATH}
 
diff --git a/Tools/machines/tioga-llnl/install_mi300a_dependencies.sh b/Tools/machines/tioga-llnl/install_mi300a_dependencies.sh
index 7e002838e4a..95633549698 100644
--- a/Tools/machines/tioga-llnl/install_mi300a_dependencies.sh
+++ b/Tools/machines/tioga-llnl/install_mi300a_dependencies.sh
@@ -143,48 +143,6 @@ cmake \
     --parallel ${build_procs}
 rm -rf ${build_dir}/lapackpp-tioga-mi300a-build
 
-# heFFTe
-if [ -d ${SRC_DIR}/heffte ]
-then
-  cd ${SRC_DIR}/heffte
-  git fetch --prune
-  git checkout v2.4.0
-  cd -
-else
-  git clone -b v2.4.0 https://github.com/icl-utk-edu/heffte.git ${SRC_DIR}/heffte
-fi
-cmake \
-    --fresh                             \
-    -S ${SRC_DIR}/heffte                \
-    -B ${build_dir}/heffte-build        \
-    -DBUILD_SHARED_LIBS=ON              \
-    -DCMAKE_BUILD_TYPE=Release          \
-    -DCMAKE_CXX_STANDARD=17             \
-    -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON         \
-    -DCMAKE_INSTALL_PREFIX=${SW_DIR}/heffte-2.4.0  \
-    -DHeffte_DISABLE_GPU_AWARE_MPI=OFF  \
-    -DHeffte_ENABLE_AVX=OFF             \
-    -DHeffte_ENABLE_AVX512=OFF          \
-    -DHeffte_ENABLE_FFTW=OFF            \
-    -DHeffte_ENABLE_CUDA=OFF            \
-    -DHeffte_ENABLE_ROCM=ON             \
-    -DHeffte_ENABLE_ONEAPI=OFF          \
-    -DHeffte_ENABLE_MKL=OFF             \
-    -DHeffte_ENABLE_DOXYGEN=OFF         \
-    -DHeffte_SEQUENTIAL_TESTING=OFF     \
-    -DHeffte_ENABLE_TESTING=OFF         \
-    -DHeffte_ENABLE_TRACING=OFF         \
-    -DHeffte_ENABLE_PYTHON=OFF          \
-    -DHeffte_ENABLE_FORTRAN=OFF         \
-    -DHeffte_ENABLE_SWIG=OFF            \
-    -DHeffte_ENABLE_MAGMA=OFF
-cmake \
-    --build ${build_dir}/heffte-build   \
-    --target install                    \
-    --parallel ${build_procs}
-rm -rf ${build_dir}/heffte-build
-
-
 # Python ######################################################################
 #
 # sometimes, the Lassen PIP Index is down
diff --git a/Tools/machines/tioga-llnl/tioga_mi300a_warpx.profile.example b/Tools/machines/tioga-llnl/tioga_mi300a_warpx.profile.example
index e3da37c5522..53fe21844c1 100644
--- a/Tools/machines/tioga-llnl/tioga_mi300a_warpx.profile.example
+++ b/Tools/machines/tioga-llnl/tioga_mi300a_warpx.profile.example
@@ -31,13 +31,11 @@ export CMAKE_PREFIX_PATH=${SW_DIR}/c-blosc-2.15.1:$CMAKE_PREFIX_PATH
 export CMAKE_PREFIX_PATH=${SW_DIR}/adios2-2.10.1:$CMAKE_PREFIX_PATH
 export CMAKE_PREFIX_PATH=${SW_DIR}/blaspp-2024.05.31:$CMAKE_PREFIX_PATH
 export CMAKE_PREFIX_PATH=${SW_DIR}/lapackpp-2024.05.31:$CMAKE_PREFIX_PATH
-export CMAKE_PREFIX_PATH=${SW_DIR}/heffte-2.4.0:$CMAKE_PREFIX_PATH
 
 export LD_LIBRARY_PATH=${SW_DIR}/c-blosc-2.15.1/lib64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=${SW_DIR}/adios2-2.10.1/lib64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=${SW_DIR}/blaspp-2024.05.31/lib64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=${SW_DIR}/lapackpp-2024.05.31/lib64:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=${SW_DIR}/heffte-2.4.0/lib64:$LD_LIBRARY_PATH
 
 export PATH=${SW_DIR}/adios2-2.10.1/bin:${PATH}
 
diff --git a/cmake/WarpXFunctions.cmake b/cmake/WarpXFunctions.cmake
index 43efd89efc5..543d0cd0ce4 100644
--- a/cmake/WarpXFunctions.cmake
+++ b/cmake/WarpXFunctions.cmake
@@ -313,10 +313,6 @@ function(set_warpx_binary_name D)
             set_property(TARGET ${tgt} APPEND_STRING PROPERTY OUTPUT_NAME ".FFT")
         endif()
 
-        if(WarpX_HEFFTE)
-            set_property(TARGET ${tgt} APPEND_STRING PROPERTY OUTPUT_NAME ".HEFFTE")
-        endif()
-
         if(WarpX_EB)
             set_property(TARGET ${tgt} APPEND_STRING PROPERTY OUTPUT_NAME ".EB")
         endif()
@@ -462,7 +458,6 @@ function(warpx_print_summary)
     message("    PARTICLE PRECISION: ${WarpX_PARTICLE_PRECISION}")
     message("    PRECISION: ${WarpX_PRECISION}")
     message("    FFT Solvers: ${WarpX_FFT}")
-    message("    heFFTe: ${WarpX_HEFFTE}")
     message("    PYTHON: ${WarpX_PYTHON}")
     if(WarpX_PYTHON)
         message("    PYTHON IPO: ${WarpX_PYTHON_IPO}")
diff --git a/cmake/dependencies/AMReX.cmake b/cmake/dependencies/AMReX.cmake
index e1072d03014..5265c152c27 100644
--- a/cmake/dependencies/AMReX.cmake
+++ b/cmake/dependencies/AMReX.cmake
@@ -51,6 +51,12 @@ macro(find_amrex)
             set(AMReX_OMP          OFF    CACHE INTERNAL "")
         endif()
 
+        if(WarpX_FFT)
+            set(AMReX_FFT ON CACHE INTERNAL "")
+        else()
+            set(AMReX_FFT OFF CACHE INTERNAL "")
+        endif()
+
         if(WarpX_EB)
             set(AMReX_EB ON CACHE INTERNAL "")
         else()
@@ -243,6 +249,11 @@ macro(find_amrex)
         foreach(D IN LISTS WarpX_amrex_dim)
             set(COMPONENT_DIMS ${COMPONENT_DIMS} ${D}D)
         endforeach()
+        if(WarpX_FFT)
+            set(COMPONENT_FFT FFT)
+        else()
+            set(COMPONENT_FFT)
+        endif()
         if(WarpX_EB)
             set(COMPONENT_EB EB)
         else()
@@ -260,7 +271,7 @@ macro(find_amrex)
         endif()
         set(COMPONENT_PRECISION ${WarpX_PRECISION} P${WarpX_PARTICLE_PRECISION})
 
-        find_package(AMReX 24.11 CONFIG REQUIRED COMPONENTS ${COMPONENT_ASCENT} ${COMPONENT_CATALYST} ${COMPONENT_DIMS} ${COMPONENT_EB} PARTICLES ${COMPONENT_PIC} ${COMPONENT_PRECISION} ${COMPONENT_SENSEI} LSOLVERS)
+        find_package(AMReX 24.11 CONFIG REQUIRED COMPONENTS ${COMPONENT_ASCENT} ${COMPONENT_CATALYST} ${COMPONENT_DIMS} ${COMPONENT_FFT} ${COMPONENT_EB} PARTICLES ${COMPONENT_PIC} ${COMPONENT_PRECISION} ${COMPONENT_SENSEI} LSOLVERS)
         # note: TINYP skipped because user-configured and optional
 
         # AMReX CMake helper scripts
@@ -283,7 +294,7 @@ set(WarpX_amrex_src ""
 set(WarpX_amrex_repo "https://github.com/AMReX-Codes/amrex.git"
     CACHE STRING
     "Repository URI to pull and build AMReX from if(WarpX_amrex_internal)")
-set(WarpX_amrex_branch "4b703fec6c2ff983e465c8cef0cc4947231edb07"
+set(WarpX_amrex_branch "294b6fee6f0c7f44693eac14e6b0c0702ecfd791"
     CACHE STRING
     "Repository branch for WarpX_amrex_repo if(WarpX_amrex_internal)")
 
diff --git a/setup.py b/setup.py
index fc99b75f2f0..cdb8a6d844e 100644
--- a/setup.py
+++ b/setup.py
@@ -105,7 +105,6 @@ def build_extension(self, ext):
             "-DWarpX_PRECISION=" + WARPX_PRECISION,
             "-DWarpX_PARTICLE_PRECISION=" + WARPX_PARTICLE_PRECISION,
             "-DWarpX_FFT:BOOL=" + WARPX_FFT,
-            "-DWarpX_HEFFTE:BOOL=" + WARPX_HEFFTE,
             "-DWarpX_PYTHON:BOOL=ON",
             "-DWarpX_PYTHON_IPO:BOOL=" + WARPX_PYTHON_IPO,
             "-DWarpX_QED:BOOL=" + WARPX_QED,
@@ -208,7 +207,6 @@ def build_extension(self, ext):
 WARPX_PRECISION = env.pop("WARPX_PRECISION", "DOUBLE")
 WARPX_PARTICLE_PRECISION = env.pop("WARPX_PARTICLE_PRECISION", WARPX_PRECISION)
 WARPX_FFT = env.pop("WARPX_FFT", "OFF")
-WARPX_HEFFTE = env.pop("WARPX_HEFFTE", "OFF")
 WARPX_QED = env.pop("WARPX_QED", "ON")
 WARPX_QED_TABLE_GEN = env.pop("WARPX_QED_TABLE_GEN", "OFF")
 WARPX_DIMS = env.pop("WARPX_DIMS", "1;2;RZ;3")