diff --git a/api/MWOperators b/api/MWOperators
index b61ee9075..3d6bc2793 100644
--- a/api/MWOperators
+++ b/api/MWOperators
@@ -38,3 +38,4 @@
 #include "operators/BSOperator.h"
 
 #include "treebuilders/apply.h"
+#include "treebuilders/DerivativeCalculator.h"
diff --git a/docs/programmers_manual/MWTree.rst b/docs/programmers_manual/MWTree.rst
index 67614b631..83abc383e 100644
--- a/docs/programmers_manual/MWTree.rst
+++ b/docs/programmers_manual/MWTree.rst
@@ -16,4 +16,3 @@ algorithm/equation/structure reasoning for having this class or where it fits wi
    :members:
    :protected-members:
    :private-members:
-
diff --git a/docs/programmers_manual/MultiResolutionAnalysis.rst b/docs/programmers_manual/MultiResolutionAnalysis.rst
new file mode 100644
index 000000000..dfec65d66
--- /dev/null
+++ b/docs/programmers_manual/MultiResolutionAnalysis.rst
@@ -0,0 +1,16 @@
+------------------
+MultiResolutionAnalysis
+------------------
+
+The MultiResolutionAnalysis (MRA) class contains the methods to project objects onto the spatial grid.
+That is, to combine different functions and operators in mathematical operations, they need to be compatible; 
+they must be defined on the same computational domain and constructed using the same polynomial basis 
+(order and type). This information constitutes an MRA, which needs to be defined and passed as argument to 
+all function and operator constructors, and only functions and operators with compatible MRAs can be 
+combined in subsequent calculations.
+
+.. doxygenclass:: mrcpp::MultiResolutionAnalysis
+   :members:
+   :protected-members:
+   :private-members:
+
diff --git a/docs/programmers_manual/overview.rst b/docs/programmers_manual/overview.rst
index 7ff6ff4cf..e78ab5cb7 100644
--- a/docs/programmers_manual/overview.rst
+++ b/docs/programmers_manual/overview.rst
@@ -13,6 +13,7 @@ TODO: maybe add some low level theory/figures/algorithms before showing classes,
    OperatorNode
    OperatorTree
    BoundingBox
+   MultiResolutionAnalysis
    CrossCorrelationCache
    LegendreBasis
    InterpolatingBasis
@@ -20,12 +21,12 @@ TODO: maybe add some low level theory/figures/algorithms before showing classes,
    ConvolutionCalculator
    
    CornerOperatorTree
+
    TimeEvolutionOperator
    JpowerIntegrals
    special_functions
    SchrodingerEvolution_CrossCorrelation
    TimeEvolution_CrossCorrelationCalculator
    complex_apply
-
    HeatOperator
    HeatKernel
diff --git a/src/functions/RepresentableFunction.h b/src/functions/RepresentableFunction.h
index 524484868..2d6998812 100644
--- a/src/functions/RepresentableFunction.h
+++ b/src/functions/RepresentableFunction.h
@@ -31,11 +31,13 @@
 
 #pragma once
 
+#include <Eigen/Core>
 #include <iostream>
 #include <vector>
 
 #include "MRCPP/constants.h"
 #include "MRCPP/mrcpp_declarations.h"
+#include "trees/NodeIndex.h"
 
 namespace mrcpp {
 
@@ -74,4 +76,15 @@ template <int D> class RepresentableFunction {
     virtual bool isZeroOnInterval(const double *a, const double *b) const { return false; }
 };
 
+/*
+ * Same as RepresentableFunction, but output a matrix of values
+ * for all points in a node, given its NodeIndex.
+ *
+ */
+class RepresentableFunction_M {
+public:
+    RepresentableFunction_M() {}
+    virtual Eigen::MatrixXd evalf(mrcpp::NodeIndex<3> nIdx) const = 0;
+};
+
 } // namespace mrcpp
diff --git a/src/operators/IdentityConvolution.cpp b/src/operators/IdentityConvolution.cpp
index 8e9bea624..5b8bde3af 100644
--- a/src/operators/IdentityConvolution.cpp
+++ b/src/operators/IdentityConvolution.cpp
@@ -29,7 +29,7 @@
 
 namespace mrcpp {
 
-/** @brief Constructor  of the IdentityConvolution object
+/** @brief Constructor of the IdentityConvolution object
  *  @returns New IdentityConvolution object
  *  @param[in] mra: Which MRA the operator is defined
  *  @param[in] prec: Build precision, closeness to delta function
diff --git a/src/treebuilders/DerivativeCalculator.cpp b/src/treebuilders/DerivativeCalculator.cpp
index 97b82414f..a5acdc297 100644
--- a/src/treebuilders/DerivativeCalculator.cpp
+++ b/src/treebuilders/DerivativeCalculator.cpp
@@ -86,6 +86,34 @@ template <int D> void DerivativeCalculator<D>::printTimers() const {
     Printer::setPrecision(oldprec);
 }
 
+template <int D> void DerivativeCalculator<D>::calcNode(MWNode<D> &inpNode, MWNode<D> &outNode) {
+    //if (this->oper->getMaxBandWidth() > 1) MSG_ABORT("Only implemented for zero bw");
+    outNode.zeroCoefs();
+    int nComp = (1 << D);
+    double tmpCoefs[outNode.getNCoefs()];
+    OperatorState<D> os(outNode, tmpCoefs);
+
+    os.setFNode(inpNode);
+    os.setFIndex(inpNode.nodeIndex);
+    for (int ft = 0; ft < nComp; ft++) {
+        double fNorm = inpNode.getComponentNorm(ft);
+        if (fNorm < MachineZero) { continue; } // Could this be used outside the loop?
+        os.setFComponent(ft);
+        for (int gt = 0; gt < nComp; gt++) {
+            os.setGComponent(gt);
+            applyOperator_bw0(os);
+       }
+    }
+   // Multiply appropriate scaling factor. TODO: Could be included elsewhere
+    const double scaling_factor =
+        1.0/std::pow(outNode.getMWTree().getMRA().getWorldBox().getScalingFactor(this->applyDir), oper->getOrder());
+    if(abs(scaling_factor-1.0)>MachineZero){
+        for (int i = 0; i < outNode.getNCoefs(); i++) outNode.getCoefs()[i] *= scaling_factor;
+    }
+    outNode.calcNorms(); //TODO:required? norms are not used for now
+}
+
+
 template <int D> void DerivativeCalculator<D>::calcNode(MWNode<D> &gNode) {
     gNode.zeroCoefs();
 
@@ -101,6 +129,7 @@ template <int D> void DerivativeCalculator<D>::calcNode(MWNode<D> &gNode) {
     this->band_t[mrcpp_get_thread_num()].stop();
 
     this->calc_t[mrcpp_get_thread_num()].resume();
+
     for (int n = 0; n < fBand.size(); n++) {
         MWNode<D> &fNode = *fBand[n];
         NodeIndex<D> &fIdx = idx_band[n];
@@ -152,6 +181,39 @@ MWNodeVector<D> DerivativeCalculator<D>::makeOperBand(const MWNode<D> &gNode, st
     return band;
 }
 
+/** Apply a single operator component (term) to a single f-node assuming zero bandwidth */
+template <int D> void DerivativeCalculator<D>::applyOperator_bw0(OperatorState<D> &os) {
+    //cout<<" applyOperator "<<endl;
+    MWNode<D> &gNode = *os.gNode;
+    MWNode<D> &fNode = *os.fNode;
+    const NodeIndex<D> &fIdx = *os.fIdx;
+    const NodeIndex<D> &gIdx = gNode.getNodeIndex();
+    int depth = gNode.getDepth();
+
+    double oNorm = 1.0;
+    double **oData = os.getOperData();
+
+    for (int d = 0; d < D; d++) {
+        const OperatorTree &oTree = this->oper->getComponent(0, d);
+        const OperatorNode &oNode = oTree.getNode(depth, 0);
+        int oIdx = os.getOperIndex(d);
+        if (this->applyDir == d) {
+            oData[d] = const_cast<double *>(oNode.getCoefs()) + oIdx * os.kp1_2;
+        } else {
+            if (oIdx == 0 or oIdx == 3) {
+                // This will activate the identity operator in direction i
+                oData[d] = nullptr;
+            } else {
+                // This means that we are in a zero part of the identity operator
+                return;
+            }
+        }
+    }
+    this->operStat.incrementFNodeCounters(fNode, os.ft, os.gt);
+    tensorApplyOperComp(os);
+}
+
+
 /** Apply a single operator component (term) to a single f-node. Whether the
 operator actualy is applied is determined by a screening threshold. */
 template <int D> void DerivativeCalculator<D>::applyOperator(OperatorState<D> &os) {
@@ -197,8 +259,8 @@ template <int D> void DerivativeCalculator<D>::applyOperator(OperatorState<D> &o
     tensorApplyOperComp(os);
 }
 
-/** Perorm the required linear algebra operations in order to apply an
-operator component to a f-node in a n-dimensional tesor space. */
+/** Perform the required linear algebra operations in order to apply an
+operator component to a f-node in a n-dimensional tensor space. */
 template <int D> void DerivativeCalculator<D>::tensorApplyOperComp(OperatorState<D> &os) {
     double **aux = os.getAuxData();
     double **oData = os.getOperData();
diff --git a/src/treebuilders/DerivativeCalculator.h b/src/treebuilders/DerivativeCalculator.h
index 67dbac01c..5d4d28716 100644
--- a/src/treebuilders/DerivativeCalculator.h
+++ b/src/treebuilders/DerivativeCalculator.h
@@ -36,6 +36,7 @@ template <int D> class DerivativeCalculator final : public TreeCalculator<D> {
     ~DerivativeCalculator() override;
 
     MWNodeVector<D> *getInitialWorkVector(MWTree<D> &tree) const override;
+    void calcNode(MWNode<D> &fNode, MWNode<D> &gNode);
 
 private:
     int applyDir;
@@ -61,6 +62,7 @@ template <int D> class DerivativeCalculator final : public TreeCalculator<D> {
     }
 
     void applyOperator(OperatorState<D> &os);
+    void applyOperator_bw0(OperatorState<D> &os);
     void tensorApplyOperComp(OperatorState<D> &os);
 };
 
diff --git a/src/treebuilders/apply.cpp b/src/treebuilders/apply.cpp
index ce5c08e53..3dc49de3c 100644
--- a/src/treebuilders/apply.cpp
+++ b/src/treebuilders/apply.cpp
@@ -256,6 +256,7 @@ template <int D> void apply_far_field(double prec, FunctionTree<D> &out, Convolu
 template <int D> void apply_near_field(double prec, FunctionTree<D> &out, ConvolutionOperator<D> &oper, FunctionTree<D> &inp, int maxIter, bool absPrec) {
     apply_on_unit_cell<D>(true, prec, out, oper, inp, maxIter, absPrec);
 }
+
 /** @brief Application of MW derivative operator
  *
  * @param[out] out: Output function to be built
@@ -274,7 +275,6 @@ template <int D> void apply_near_field(double prec, FunctionTree<D> &out, Convol
  */
 template <int D> void apply(FunctionTree<D> &out, DerivativeOperator<D> &oper, FunctionTree<D> &inp, int dir) {
     if (out.getMRA() != inp.getMRA()) MSG_ABORT("Incompatible MRA");
-
     TreeBuilder<D> builder;
     int maxScale = out.getMRA().getMaxScale();
 
diff --git a/src/trees/FunctionNode.cpp b/src/trees/FunctionNode.cpp
index 2c8eee329..c839e2b57 100644
--- a/src/trees/FunctionNode.cpp
+++ b/src/trees/FunctionNode.cpp
@@ -130,7 +130,6 @@ template <int D> double FunctionNode<D>::integrateInterpolating() const {
     int qOrder = this->getKp1();
     getQuadratureCache(qc);
     const VectorXd &weights = qc.getWeights(qOrder);
-
     double sqWeights[qOrder];
     for (int i = 0; i < qOrder; i++) sqWeights[i] = std::sqrt(weights[i]);
 
@@ -158,6 +157,52 @@ template <int D> double FunctionNode<D>::integrateInterpolating() const {
     return two_n * sum;
 }
 
+/** Function integration, Interpolating basis.
+ *
+ * Integrates the function represented on the node on the full support of the
+ * node. A bit more involved than in the Legendre basis, as is requires some
+ * coupling of quadrature weights. */
+template <int D> double FunctionNode<D>::integrateValues() const {
+    int qOrder = this->getKp1();
+    getQuadratureCache(qc);
+    const VectorXd &weights = qc.getWeights(qOrder);
+    VectorXd coefs;
+    this->getCoefs(coefs);
+    int ncoefs = coefs.size();
+    int ncoefChild = ncoefs/(1<<D);
+    double cc[ncoefChild];
+    // factorize out the children
+    for (int i = 0; i < ncoefChild; i++)cc[i]=coefs[i];
+    for (int j = 1; j < (1<<D); j++) for (int i = 0; i < ncoefChild; i++)cc[i]+=coefs[j*ncoefChild+i];
+
+    int nc = 0;
+    double sum = 0.0;
+    if (D > 3) MSG_ABORT("Not Implemented")
+    else if (D == 3) {
+        for (int i = 0; i < qOrder; i++) {
+            double sumj = 0.0;
+            for (int j = 0; j < qOrder; j++) {
+                double sumk = 0.0;
+                for (int k = 0; k < qOrder; k++) sumk += cc[nc++] * weights[k];
+                sumj += sumk * weights[j];
+            }
+            sum += sumj * weights[i];
+        }
+    } else if (D==2) {
+        for (int j = 0; j < qOrder; j++) {
+                double sumk = 0.0;
+                for (int k = 0; k < qOrder; k++) sumk += cc[nc++] * weights[k];
+                sum += sumk * weights[j];
+        }
+    } else if (D==1) for (int k = 0; k < qOrder; k++) sum += cc[nc++] * weights[k];
+
+    int n = D * (this->getScale() + 1) ; // NB: one extra scale
+    int two_n = (1<<abs(n)); // 2**n;
+    if(n>0)sum/=two_n;
+    else sum*=two_n;
+    return sum;
+}
+
 template <int D> void FunctionNode<D>::setValues(const VectorXd &vec) {
     this->zeroCoefs();
     this->setCoefBlock(0, vec.size(), vec.data());
@@ -319,6 +364,9 @@ template <int D> void FunctionNode<D>::dealloc() {
     auto &ftree = this->getFuncTree();
     if (this->isGenNode()) {
         ftree.getGenNodeAllocator().dealloc(sIdx);
+        // for GenNodes we clean unused chunks carefully, as they can become
+        // very large and occupy space long after used.
+        ftree.getGenNodeAllocator().deleteUnusedChunks();
     } else {
         ftree.decrementNodeCount(this->getScale());
         ftree.getNodeAllocator().dealloc(sIdx);
diff --git a/src/trees/FunctionNode.h b/src/trees/FunctionNode.h
index c110e12d8..97a3d74d3 100644
--- a/src/trees/FunctionNode.h
+++ b/src/trees/FunctionNode.h
@@ -77,6 +77,7 @@ template <int D> class FunctionNode final : public MWNode<D> {
 
     double integrateLegendre() const;
     double integrateInterpolating() const;
+    double integrateValues() const;
 };
 
 template <int D> double dot_scaling(const FunctionNode<D> &bra, const FunctionNode<D> &ket);
diff --git a/src/trees/FunctionTree.cpp b/src/trees/FunctionTree.cpp
index 6d27270ac..47614a933 100644
--- a/src/trees/FunctionTree.cpp
+++ b/src/trees/FunctionTree.cpp
@@ -54,7 +54,7 @@ template <int D>
 FunctionTree<D>::FunctionTree(const MultiResolutionAnalysis<D> &mra, SharedMemory *sh_mem, const std::string &name)
         : MWTree<D>(mra, name)
         , RepresentableFunction<D>(mra.getWorldBox().getLowerBounds().data(), mra.getWorldBox().getUpperBounds().data()) {
-    int nodesPerChunk = 64;
+    int nodesPerChunk = 2048; // Large chunks are required for not leading to memory fragmentation (32 MB on "Betzy" 2023)
     int coefsGenNodes = this->getKp1_d();
     int coefsRegNodes = this->getTDim() * this->getKp1_d();
     this->nodeAllocator_p = std::make_unique<NodeAllocator<D>>(this, sh_mem, coefsRegNodes, nodesPerChunk);
@@ -186,6 +186,42 @@ template <int D> double FunctionTree<D>::integrate() const {
     return jacobian * result;
 }
 
+
+/** @returns Integral of a representable function over the grid given by the tree */
+template <> double FunctionTree<3>::integrateEndNodes(RepresentableFunction_M &f) {
+    //traverse tree, and treat end nodes only
+    std::vector<FunctionNode<3> *> stack;   // node from this
+    for (int i = 0; i < this->getRootBox().size(); i++) stack.push_back(&(this->getRootFuncNode(i)));
+    int basis = getMRA().getScalingBasis().getScalingType();
+    double result = 0.0;
+    while (stack.size() > 0) {
+        FunctionNode<3> *Node = stack.back();
+        stack.pop_back();
+        if (Node->getNChildren() > 0) {
+            for (int i = 0; i < Node->getNChildren(); i++) stack.push_back(&(Node->getFuncChild(i)));
+        } else {
+            //end nodes
+            Eigen::MatrixXd fmat = f.evalf(Node->nodeIndex);
+            double *coefs = Node->getCoefs(); // save position of coeff, but do not use them!
+            // The data in fmat is not organized so that two consecutive points are stored after each other in memory, so needs to copy before mwtransform, cannot use memory adress directly.
+            int nc=fmat.cols();
+            double cc[nc];
+            for (int i = 0; i < nc; i++)cc[i]=fmat(0,i);
+            Node->attachCoefs(cc);
+            result += Node->integrateValues();
+            Node->attachCoefs(coefs); // put back original coeff
+        }
+    }
+
+    // Handle potential scaling
+    auto scaling_factor = this->getMRA().getWorldBox().getScalingFactors();
+    auto jacobian = 1.0;
+    for (const auto &sf_i : scaling_factor) jacobian *= std::sqrt(sf_i);
+    // Square root of scaling factor in each diection. The seemingly missing
+    // multiplication by the square root of sf_i is included in the basis
+    return jacobian * result;
+}
+
 /** @returns Function value in a point, out of bounds returns zero
  *
  * @param[in] r: Cartesian coordinate
diff --git a/src/trees/FunctionTree.h b/src/trees/FunctionTree.h
index 613399109..0be9563ea 100644
--- a/src/trees/FunctionTree.h
+++ b/src/trees/FunctionTree.h
@@ -62,6 +62,7 @@ template <int D> class FunctionTree final : public MWTree<D>, public Representab
     ~FunctionTree() override;
 
     double integrate() const;
+    double integrateEndNodes(RepresentableFunction_M &f);
     double evalf_precise(const Coord<D> &r);
     double evalf(const Coord<D> &r) const override;
 
diff --git a/src/trees/MWNode.cpp b/src/trees/MWNode.cpp
index 927019f63..d15c0939f 100644
--- a/src/trees/MWNode.cpp
+++ b/src/trees/MWNode.cpp
@@ -103,7 +103,7 @@ MWNode<D>::MWNode(MWTree<D> *tree, int rIdx)
 /** @brief MWNode constructor.
  *
  * @param[in] parent: parent node
- * @param[in] cIdx: child index of the current node 
+ * @param[in] cIdx: child index of the current node
  *
  * @details Constructor for leaf nodes. It requires the corresponding
  * parent and an integer to identify the correct child.
@@ -131,7 +131,7 @@ MWNode<D>::MWNode(MWNode<D> *parent, int cIdx)
  * the tree.
  */
 template <int D>
-MWNode<D>::MWNode(const MWNode<D> &node, bool allocCoef)
+MWNode<D>::MWNode(const MWNode<D> &node, bool allocCoef, bool SetCoef)
         : tree(node.tree)
         , parent(nullptr)
         , nodeIndex(node.nodeIndex)
@@ -141,7 +141,7 @@ MWNode<D>::MWNode(const MWNode<D> &node, bool allocCoef)
     setIsLooseNode();
     if (allocCoef) {
         allocCoefs(this->getTDim(), this->getKp1_d());
-        if (node.hasCoefs()) {
+        if (node.hasCoefs() and SetCoef) {
             setCoefBlock(0, node.getNCoefs(), node.getCoefs());
             if (this->getNCoefs() > node.getNCoefs()) {
                 for (int i = node.getNCoefs(); i < this->getNCoefs(); i++) this->coefs[i] = 0.0;
@@ -236,7 +236,7 @@ template <int D> void MWNode<D>::getCoefs(Eigen::VectorXd &c) const {
     c = VectorXd::Map(this->coefs, this->n_coefs);
 }
 
-/** @brief sets all MW coefficients and the norms to zero 
+/** @brief sets all MW coefficients and the norms to zero
  *
  */
 template <int D> void MWNode<D>::zeroCoefs() {
@@ -388,7 +388,7 @@ template <int D> void MWNode<D>::giveParentCoefs(bool overwrite) {
 }
 
 /** @brief Copy scaling coefficients from children to parent
- * 
+ *
  * @details Takes the scaling coefficients of the children and stores
  * them consecutively in the corresponding block of the parent,
  * following the usual bitwise notation.
@@ -404,10 +404,12 @@ template <int D> void MWNode<D>::copyCoefsFromChildren() {
 }
 
 /** @brief Generates scaling cofficients of children
- * 
+ *
  * @details If the node is a leafNode, it takes the scaling&wavelet
  * coefficients of the parent and it generates the scaling
- * coefficients for the children
+ * coefficients for the children and stores
+ * them consecutively in the corresponding block of the parent,
+ * following the usual bitwise notation.
  */
 template <int D> void MWNode<D>::threadSafeGenChildren() {
     if (tree->isLocal) { NOT_IMPLEMENTED_ABORT; }
@@ -835,7 +837,7 @@ template <int D> void MWNode<D>::getPrimitiveQuadPts(MatrixXd &pts) const {
  * the set of quadrature points becomes \f$ x^\alpha_i = 2^{-n-1} (x_i + 2 l^\alpha + t^\alpha) \f$, where \f$ t^\alpha =
  * 0,1 \f$. By taking all possible \f$(k+1)^d\combinations \f$, they will
  * then define a d-dimensional grid of quadrature points for the child
- * nodes. 
+ * nodes.
  *
  */
 template <int D> void MWNode<D>::getPrimitiveChildPts(MatrixXd &pts) const {
diff --git a/src/trees/MWNode.h b/src/trees/MWNode.h
index 2b398c78c..d7d7a18a7 100644
--- a/src/trees/MWNode.h
+++ b/src/trees/MWNode.h
@@ -47,13 +47,13 @@ namespace mrcpp {
  * set thoucgh the template parameter D=1,2,3. In addition to the
  * coefficients the node contains metadata such as the scale, the
  * translation index, the norm, pointers to parent node and child
- * nodes, pointer to the corresponding MWTree etc... See memeber and
+ * nodes, pointer to the corresponding MWTree etc... See member and
  * data descriptions for details.
  * 
  */
 template <int D> class MWNode {
 public:
-    MWNode(const MWNode<D> &node, bool allocCoef = true);
+    MWNode(const MWNode<D> &node, bool allocCoef = true, bool SetCoef = true);
     MWNode<D> &operator=(const MWNode<D> &node) = delete;
     virtual ~MWNode();
 
@@ -164,6 +164,7 @@ template <int D> class MWNode {
     friend class OperatorTree;
     friend class FunctionNode<D>;
     friend class OperatorNode;
+    friend class DerivativeCalculator<D>;
 
 protected:
     MWTree<D> *tree{nullptr};    ///< Tree the node belongs to
@@ -171,7 +172,7 @@ template <int D> class MWNode {
     MWNode<D> *children[1 << D]; ///< 2^D children
 
     double squareNorm{-1.0};       ///< Squared norm of all 2^D (k+1)^D coefficients
-    double componentNorms[1 << D]; ///< Squared norms of the separeted 2^D components 
+    double componentNorms[1 << D]; ///< Squared norms of the separeted 2^D components
     double maxSquareNorm{-1.0};    ///< Largest squared norm among itself and descendants.
     double maxWSquareNorm{-1.0};   ///< Largest wavelet squared norm among itself and descendants.
                                    ///< NB: must be set before used.
diff --git a/src/trees/MWTree.h b/src/trees/MWTree.h
index 1587aa644..51cfe3eed 100644
--- a/src/trees/MWTree.h
+++ b/src/trees/MWTree.h
@@ -39,7 +39,7 @@
 namespace mrcpp {
 
 class BankAccount;
-  
+
 /** @class MWTree
  *
  * @brief Base class for Multiwavelet tree structures, such as FunctionTree and OperatorTree
diff --git a/src/trees/MultiResolutionAnalysis.cpp b/src/trees/MultiResolutionAnalysis.cpp
index 4ab7493bb..6eaabb120 100644
--- a/src/trees/MultiResolutionAnalysis.cpp
+++ b/src/trees/MultiResolutionAnalysis.cpp
@@ -32,6 +32,22 @@
 
 namespace mrcpp {
 
+/** @returns New MultiResolutionAnalysis (MRA) object
+ *
+ *  @brief Constructs a MultiResolutionAnalysis object composed of computational domain (world) and a polynomial basis (Multiwavelets)
+ *
+ *  @param[in] bb: 2-element integer array [Lower, Upper] defining the bounds for a BoundingBox object representing the computational domain
+ *  @param[in] order: Maximum polynomial order of the multiwavelet basis,
+ *  immediately used in the constructor of an InterPolatingBasis object which becomes an attribute of the MRA
+ *  @param[in] maxDepth: Exponent of the node refinement in base 2, relative to root scale.
+ *  In other words, it is the maximum amount of refinement that we allow in a node, in other to avoid overflow of values.
+ *
+ *  @details Constructor of the MultiResolutionAnalysis class from scratch, without requiring any pre-existing complex structure.
+ *  The constructor calls the InterpolatingBasis basis constructor to generate the MultiWavelets basis of functions,
+ *  then the BoundingBox constructor to create the computational domain. The constructor then checks if the generated node depth, or
+ *  node refinement is beyond the root scale or the maximum depth allowed, in which case it will abort the process.
+ *  Otherwise, the process goes on to setup the filters with the class' setupFilter method.
+ */
 template <int D>
 MultiResolutionAnalysis<D>::MultiResolutionAnalysis(std::array<int, 2> bb, int order, int depth)
         : maxDepth(depth)
@@ -42,6 +58,18 @@ MultiResolutionAnalysis<D>::MultiResolutionAnalysis(std::array<int, 2> bb, int o
     setupFilter();
 }
 
+/** @returns New MultiResolutionAnalysis (MRA) object
+ *
+ *  @brief Constructs a MultiResolutionAnalysis object composed of computational domain (world) and a polynomial basis (Multiwavelets) from a pre-existing BoundingBox object
+ *
+ *  @param[in] bb: BoundingBox object representing the computational domain
+ *  @param[in] order: (integer) Maximum polynomial order of the multiwavelet basis,
+ *  immediately used in the constructor of an InterPolatingBasis object which becomes an attribute of the MRA
+ *  @param[in] maxDepth: (integer) Exponent of the node refinement in base 2, relative to root scale.
+ *  In other words, it is the maximum amount of refinement that we allow in a node, in other to avoid overflow of values.
+ *
+ *  @details Constructor of the MultiResolutionAnalysis class from a BoundingBox object. For more details see the first constructor.
+ */
 template <int D>
 MultiResolutionAnalysis<D>::MultiResolutionAnalysis(const BoundingBox<D> &bb, int order, int depth)
         : maxDepth(depth)
@@ -52,6 +80,14 @@ MultiResolutionAnalysis<D>::MultiResolutionAnalysis(const BoundingBox<D> &bb, in
     setupFilter();
 }
 
+/** @returns New MultiResolutionAnalysis (MRA) object
+ *
+ *  @brief Copy constructor for a MultiResolutionAnalysis object composed of computational domain (world) and a polynomial basis (Multiwavelets)
+ *
+ *  @param[in] mra: Pre-existing MRA object
+ *
+ *  @details Copy a MultiResolutionAnalysis object without modifying the original. For more details see the first constructor.
+ */
 template <int D>
 MultiResolutionAnalysis<D>::MultiResolutionAnalysis(const MultiResolutionAnalysis<D> &mra)
         : maxDepth(mra.maxDepth)
@@ -63,9 +99,14 @@ MultiResolutionAnalysis<D>::MultiResolutionAnalysis(const MultiResolutionAnalysi
 }
 
 /** @returns New MultiResolutionAnalysis object
- * @param[in] bb: Computational domain
- * @param[in] sb: Polynomial basis
+ *
+ *  @brief Constructor for a MultiResolutionAnalysis object from a pre-existing BoundingBox (computational domain) and a ScalingBasis (Multiwavelet basis) objects
+ *
+ * @param[in] bb: Computational domain as a BoundingBox object, taken by constant reference
+ * @param[in] sb: Polynomial basis (MW) as a ScalingBasis object
  * @param[in] depth: Maximum allowed resolution depth, relative to root scale
+ *
+ *  @details Creates a MRA object from pre-existing BoundingBox and ScalingBasis objects. These objects are taken as reference. For more details about the constructor itself, see the first constructor.
  */
 template <int D>
 MultiResolutionAnalysis<D>::MultiResolutionAnalysis(const BoundingBox<D> &bb, const ScalingBasis &sb, int depth)
@@ -77,6 +118,16 @@ MultiResolutionAnalysis<D>::MultiResolutionAnalysis(const BoundingBox<D> &bb, co
     setupFilter();
 }
 
+/** @returns Whether the two MRA objects are equal.
+ *
+ *  @brief Equality operator for the MultiResolutionAnalysis class, returns true if both MRAs have the same polynomial basis, computational domain and maximum depth, and false otherwise
+ *
+ * @param[in] mra: MRA object, taken by constant reference
+ *
+ *  @details Equality operator for the MultiResolutionAnalysis class, returns true if both MRAs have the same polynomial basis represented by a BoundingBox object, computational domain (ScalingBasis object) and maximum depth (integer), and false otherwise.
+ *  Computations on different MRA cannot be combined, this operator can be used to make sure that the multiple MRAs are compatible.
+ *  For more information about the meaning of equality for BoundingBox and ScalingBasis objets, see their respective classes.
+ */
 template <int D> bool MultiResolutionAnalysis<D>::operator==(const MultiResolutionAnalysis<D> &mra) const {
     if (this->basis != mra.basis) return false;
     if (this->world != mra.world) return false;
@@ -84,13 +135,28 @@ template <int D> bool MultiResolutionAnalysis<D>::operator==(const MultiResoluti
     return true;
 }
 
+/** @returns Whether the two MRA objects are not equal.
+ *
+ *  @brief Inequality operator for the MultiResolutionAnalysis class, returns false if both MRAs have the same polynomial basis, computational domain and maximum depth, and true otherwise
+ *
+ * @param[in] mra: MRA object, taken by constant reference
+ *
+ *  @details Inequality operator for the MultiResolutionAnalysis class, returns true if both MRAs have the same polynomial basis represented by a BoundingBox object, computational domain (ScalingBasis object) and maximum depth (integer), and false otherwise.
+ *  Opposite of the == operator.
+ *  For more information about the meaning of equality for BoundingBox and ScalingBasis objets, see their respective classes.
+ */
 template <int D> bool MultiResolutionAnalysis<D>::operator!=(const MultiResolutionAnalysis<D> &mra) const {
-    if (this->basis != mra.basis) return true;
-    if (this->world != mra.world) return true;
-    if (this->maxDepth != mra.maxDepth) return true;
-    return false;
+    return !(*this == mra);
 }
 
+/**
+ *
+ *  @brief Displays the MRA's attributes in the outstream defined in the Printer class
+ *
+ *  @details This function displays the attributes of the MRA in the using the Printer class.
+ *  By default, the Printer class writes all information in the output file, not the terminal.
+ *
+ */
 template <int D> void MultiResolutionAnalysis<D>::print() const {
     print::separator(0, ' ');
     print::header(0, "MultiResolution Analysis");
@@ -100,6 +166,15 @@ template <int D> void MultiResolutionAnalysis<D>::print() const {
     print::separator(0, '=', 2);
 }
 
+/**
+ *
+ *  @brief Initializes the MW filters for the given MW basis.
+ *
+ *  @details By calling the get() function for the appropriate MW basis, the global
+ *  FilterCache Singleton object is initialized. Any subsequent reference to this
+ *  particular filter will point to the same unique global object.
+ *
+ */
 template <int D> void MultiResolutionAnalysis<D>::setupFilter() {
     getLegendreFilterCache(lfilters);
     getInterpolatingFilterCache(ifilters);
@@ -117,6 +192,11 @@ template <int D> void MultiResolutionAnalysis<D>::setupFilter() {
     }
 }
 
+/** @returns Maximum possible distance between two points in the MRA domain
+ *
+ *  @brief Computes the difference between the lower and upper bounds of the computational domain
+ *
+ */
 template <int D> double MultiResolutionAnalysis<D>::calcMaxDistance() const {
     const Coord<D> &lb = getWorldBox().getLowerBounds();
     const Coord<D> &ub = getWorldBox().getUpperBounds();
diff --git a/src/trees/NodeAllocator.cpp b/src/trees/NodeAllocator.cpp
index 65d76b940..9ca79f0b4 100644
--- a/src/trees/NodeAllocator.cpp
+++ b/src/trees/NodeAllocator.cpp
@@ -253,6 +253,7 @@ template <int D> int NodeAllocator<D>::deleteUnusedChunks() {
     // number of occupied chunks
     int nChunksTotal = getNChunks();
     int nChunksUsed = getNChunksUsed();
+    if(nChunksTotal == nChunksUsed) return 0; // no unused chunks
     assert(nChunksTotal >= nChunksUsed);
     for (int i = nChunksUsed; i < nChunksTotal; i++) delete[](char *)(this->nodeChunks[i]);
 
diff --git a/src/trees/NodeAllocator.h b/src/trees/NodeAllocator.h
index d16f7a22f..38e4ba7eb 100644
--- a/src/trees/NodeAllocator.h
+++ b/src/trees/NodeAllocator.h
@@ -56,6 +56,7 @@ template <int D> class NodeAllocator final {
 
     int compress();
     void reassemble();
+    int deleteUnusedChunks();
 
     int getNNodes() const { return this->nNodes; }
     int getNCoefs() const { return this->coefsPerNode; }
@@ -97,7 +98,6 @@ template <int D> class NodeAllocator final {
 
     void moveNodes(int nNodes, int srcIdx, int dstIdx);
     void appendChunk(bool coefs);
-    int deleteUnusedChunks();
 
     int findNextAvailable(int sIdx, int nNodes) const;
     int findNextOccupied(int sIdx) const;
diff --git a/src/utils/Bank.cpp b/src/utils/Bank.cpp
index a70385136..a774c44ff 100644
--- a/src/utils/Bank.cpp
+++ b/src/utils/Bank.cpp
@@ -16,18 +16,49 @@ Bank::~Bank() {
 }
 
 struct Blockdata_struct {
-    std::vector<double *> data; // to store the incoming data
-    std::vector<bool> deleted;  // to indicate if it has been deleted already
-    MatrixXd BlockData;         // to put the final block
-    // eigen matrix are per default stored column-major (one can add columns at the end)
-    std::vector<int> N_rows;
+    std::vector<double *> data; // to store the incoming data. One column for each orbital on the same node.
+    int N_rows = 0; // the number of coefficients in one column of the block.
     std::map<int, int> id2data; // internal index of the data in the block
     std::vector<int> id;        // the id of each column. Either nodeid, or orbid
 };
-std::map<int, std::map<int, Blockdata_struct *> *> get_nodeid2block; // to get block from its nodeid (all coeff for one node)
-std::map<int, std::map<int, Blockdata_struct *> *> get_orbid2block;  // to get block from its orbid
+struct OrbBlock_struct {
+    std::vector<double *> data; // pointer to the data
+    std::map<int, int> id2data; // internal index of the data in the block
+    std::vector<int> id;        // the nodeid of the data
+    // note that N_rows can be different inside the same orbblock: root node have scaling and wavelets, other nodes have only wavelets
+};
+struct mem_struct {
+    std::vector<double *> chunk_p; // vector with allocated chunks
+    int p = -1; // position of next available memory (not allocated if < 0)
+    //on Betzy 1024*1024*4 ok, 1024*1024*2 NOT ok: leads to memory fragmentation (on "Betzy" 2023)
+    int chunk_size = 1024*1024*4; // chunksize (in number of doubles). data_p[i]+chunk_size is end of chunk i
+    int account=-1;
+    double * get_mem(int size){
+        if(p<0 or size > chunk_size or p + size > chunk_size){ //allocate new chunk of memory
+            if(size > 1024*1024){
+                //make a special chunk just for this
+                double * m_p = new double[size];
+                chunk_p.push_back(m_p);
+                p=-1;
+                return m_p;
+            } else {
+                double * m_p = new double[chunk_size];
+                chunk_p.push_back(m_p);
+                p=0;
+            }
+        }
+        double * m_p =  chunk_p[chunk_p.size()-1] + p;
+        p += size;
+        return m_p;
+    }
+};
+std::map<int, std::map<int, Blockdata_struct> *> get_nodeid2block; // to get block from its nodeid (all coeff for one node)
+std::map<int, std::map<int, OrbBlock_struct> *> get_orbid2block;  // to get block from its orbid
+
+std::map<int, mem_struct*> mem;
 
 int const MIN_SCALE = -999; // Smaller than smallest scale
+int naccounts = 0;
 
 void Bank::open() {
 #ifdef MRCPP_HAS_MPI
@@ -67,9 +98,10 @@ void Bank::open() {
             continue;
         } else if (message == NEW_ACCOUNT) {
             // we just have to pick out a number that is not already assigned
-            int account = (max_account_id + 1) % 1000000000 + 1;
+            int account = (max_account_id + 1) % 1000000000;
             while (get_deposits.count(account)) account = (account + 1) % 1000000000; // improbable this is used
             max_account_id = account;
+            naccounts++;
             // create default content
             get_deposits[account] = new std::vector<deposit>;
             get_deposits[account]->resize(1);
@@ -77,11 +109,13 @@ void Bank::open() {
             get_id2qu[account] = new std::map<int, int>;
             get_queue[account] = new std::vector<queue_struct>;
             get_queue[account]->resize(1);
-            get_orbid2block[account] = new std::map<int, Blockdata_struct *>;
-            get_nodeid2block[account] = new std::map<int, Blockdata_struct *>;
+            get_orbid2block[account] = new std::map<int, OrbBlock_struct>;
+            get_nodeid2block[account] = new std::map<int, Blockdata_struct>;
             get_numberofclients[account] = messages[1];
             get_readytasks[account] = new std::map<int, std::vector<int>>;
             currentsize[account] = 0;
+            mem[account] = new mem_struct;
+            mem[account]->account=account;
             MPI_Send(&account, 1, MPI_INT, status.MPI_SOURCE, 1, comm_bank);
             continue;
         }
@@ -98,8 +132,8 @@ void Bank::open() {
         std::map<int, int> &id2ix = *get_id2ix[account]; // gives zero if id is not defined
         std::map<int, int> &id2qu = *get_id2qu[account];
         std::vector<queue_struct> &queue = *get_queue[account];
-        std::map<int, Blockdata_struct *> &orbid2block = *get_orbid2block[account];
-        std::map<int, Blockdata_struct *> &nodeid2block = *get_nodeid2block[account];
+        std::map<int, OrbBlock_struct> &orbid2block = *get_orbid2block[account];
+        std::map<int, Blockdata_struct> &nodeid2block = *get_nodeid2block[account];
         auto it_tasks = get_readytasks.find(account);
         if (it_tasks == get_readytasks.end() || it_tasks->second == nullptr) {
             cout << "ERROR, my account does not exist!! " << account << " " << message << endl;
@@ -111,7 +145,6 @@ void Bank::open() {
             get_numberofclients[account]--;
             if (get_numberofclients[account] == 0) {
                 // all clients have closed the account. We remove the account.
-                totcurrentsize -= currentsize[account];
                 remove_account(account);
             }
         }
@@ -119,116 +152,51 @@ void Bank::open() {
         else if (message == CLEAR_BANK) {
             this->clear_bank();
             for (auto const &block : nodeid2block) {
-                if (block.second == nullptr) continue;
-                for (int i = 0; i < block.second->data.size(); i++) {
-                    if (not block.second->deleted[i]) {
-                        currentsize[account] -= block.second->N_rows[i] / 128; // converted into kB
-                        totcurrentsize -= block.second->N_rows[i] / 128;       // converted into kB
-                        delete[] block.second->data[i];
-                    }
+                if (block.second.data.size() > 0) {
+                    currentsize[account] -= block.second.N_rows * block.second.data.size()/ 128; // converted into kB
+                    totcurrentsize -= block.second.N_rows * block.second.data.size()/ 128;       // converted into kB
                 }
-                delete block.second;
             }
             nodeid2block.clear();
             orbid2block.clear();
             // send message that it is ready (value of message is not used)
             MPI_Ssend(&message, 1, MPI_INT, status.MPI_SOURCE, 77, comm_bank);
-        } else if (message == CLEAR_BLOCKS) {
-            // clear only blocks whith id less than idmax.
-            int idmax = messages[2];
-            std::vector<int> toeraseVec; // it is dangerous to erase an iterator within its own loop
-            for (auto const &block : nodeid2block) {
-                if (block.second == nullptr) toeraseVec.push_back(block.first);
-                if (block.second == nullptr) continue;
-                if (block.first >= idmax and idmax != 0) continue;
-                for (int i = 0; i < block.second->data.size(); i++) {
-                    if (not block.second->deleted[i]) {
-                        currentsize[account] -= block.second->N_rows[i] / 128; // converted into kB
-                        totcurrentsize -= block.second->N_rows[i] / 128;       // converted into kB
-                        delete[] block.second->data[i];
-                    }
-                }
-                currentsize[account] -= block.second->BlockData.size() / 128; // converted into kB
-                totcurrentsize -= block.second->BlockData.size() / 128;       // converted into kB
-                block.second->BlockData.resize(0, 0);                         // NB: the matrix does not clear itself otherwise
-                // assert(currentsize[account] >= 0);
-                this->currentsize[account] = std::max(0ll, currentsize[account]);
-                toeraseVec.push_back(block.first);
-            }
-            for (int ierase : toeraseVec) { nodeid2block.erase(ierase); }
-            toeraseVec.clear();
-            std::vector<int> datatoeraseVec;
-            for (auto const &block : orbid2block) {
-                if (block.second == nullptr) toeraseVec.push_back(block.first);
-                if (block.second == nullptr) continue;
-                datatoeraseVec.clear();
-                for (int i = 0; i < block.second->data.size(); i++) {
-                    if (block.second->id[i] < idmax or idmax == 0) datatoeraseVec.push_back(i);
-                    if (block.second->id[i] < idmax or idmax == 0) block.second->data[i] = nullptr;
-                }
-                std::sort(datatoeraseVec.begin(), datatoeraseVec.end());
-                std::reverse(datatoeraseVec.begin(), datatoeraseVec.end());
-                for (int ierase : datatoeraseVec) {
-                    block.second->id.erase(block.second->id.begin() + ierase);
-                    block.second->data.erase(block.second->data.begin() + ierase);
-                    block.second->N_rows.erase(block.second->N_rows.begin() + ierase);
-                }
-                if (block.second->data.size() == 0) toeraseVec.push_back(block.first);
-            }
-            for (int ierase : toeraseVec) { orbid2block.erase(ierase); }
-
-            if (idmax == 0) orbid2block.clear();
-            // could have own clear for data?
-            for (int ix = 1; ix < deposits.size(); ix++) {
-                if (deposits[ix].hasdata) delete deposits[ix].data;
-                if (deposits[ix].hasdata) id2ix[deposits[ix].id] = 0; // indicate that it does not exist
-                deposits[ix].hasdata = false;
-            }
-            // send message that it is ready (value of message is not used)
-            MPI_Ssend(&message, 1, MPI_INT, status.MPI_SOURCE, 78, comm_bank);
         }
 
         else if (message == GET_NODEDATA or message == GET_NODEBLOCK) {
             // NB: has no queue system yet
             int nodeid = messages[2]; // which block to fetch from
-            if (nodeid2block.count(nodeid) and nodeid2block[nodeid] != nullptr) {
-                Blockdata_struct *block = nodeid2block[nodeid];
+            if (nodeid2block.count(nodeid)) {
+                Blockdata_struct &block = nodeid2block[nodeid];
                 int dataindex = 0; // internal index of the data in the block
                 int size = 0;
                 if (message == GET_NODEDATA) {
                     int orbid = messages[3];           // which part of the block to fetch
-                    dataindex = block->id2data[orbid]; // column of the data in the block
-                    size = block->N_rows[dataindex];   // number of doubles to fetch
+                    dataindex = block.id2data[orbid]; // column of the data in the block
+                    size = block.N_rows;   // number of doubles to fetch
                     if (size != messages[4]) std::cout << "ERROR nodedata has wrong size" << std::endl;
+                    double *data_p = block.data[dataindex];
+                    if (size > 0) MPI_Send(data_p, size, MPI_DOUBLE, status.MPI_SOURCE, 3, comm_bank);
                 } else {
                     // send entire block. First make one contiguous superblock
                     // Prepare the data as one contiguous block
-                    if (block->data.size() == 0) std::cout << "Zero size blockdata! " << nodeid << " " << block->N_rows.size() << std::endl;
-                    block->BlockData.resize(block->N_rows[0], block->data.size());
-                    size = block->N_rows[0] * block->data.size();
-                    if (printinfo) std::cout << " rewrite into superblock " << block->data.size() << " " << block->N_rows[0] << " nodeid " << nodeid << std::endl;
-                    for (int j = 0; j < block->data.size(); j++) {
-                        for (int i = 0; i < block->N_rows[j]; i++) { block->BlockData(i, j) = block->data[j][i]; }
-                    }
-                    // repoint to the data in BlockData
-                    for (int j = 0; j < block->data.size(); j++) {
-                        if (block->deleted[j] == true) std::cout << "ERROR data already deleted " << std::endl;
-                        assert(block->deleted[j] == false);
-                        delete[] block->data[j];
-                        block->deleted[j] = true;
-                        block->data[j] = block->BlockData.col(j).data();
-                    }
+                    if (block.data.size() == 0) std::cout << "Zero size blockdata! " << nodeid << " " << block.N_rows << std::endl;
+                    MatrixXd DataBlock(block.N_rows, block.data.size());
+                    size = block.N_rows * block.data.size();
+                    if (printinfo) std::cout << " rewrite into superblock " << block.data.size() << " " << block.N_rows << " nodeid " << nodeid << std::endl;
+                    for (int j = 0; j < block.data.size(); j++) {
+                        for (int i = 0; i < block.N_rows; i++) { DataBlock(i, j) = block.data[j][i]; }
+                   }
                     dataindex = 0; // start from first column
                     // send info about the size of the superblock
                     metadata_block[0] = nodeid;             // nodeid
-                    metadata_block[1] = block->data.size(); // number of columns
+                    metadata_block[1] = block.data.size(); // number of columns
                     metadata_block[2] = size;               // total size = rows*columns
                     MPI_Send(metadata_block, size_metadata, MPI_INT, status.MPI_SOURCE, 1, comm_bank);
                     // send info about the id of each column
-                    MPI_Send(block->id.data(), metadata_block[1], MPI_INT, status.MPI_SOURCE, 2, comm_bank);
+                    MPI_Send(block.id.data(), metadata_block[1], MPI_INT, status.MPI_SOURCE, 2, comm_bank);
+                    if (size > 0) MPI_Send(DataBlock.data(), size, MPI_DOUBLE, status.MPI_SOURCE, 3, comm_bank);
                 }
-                double *data_p = block->data[dataindex];
-                if (size > 0) MPI_Send(data_p, size, MPI_DOUBLE, status.MPI_SOURCE, 3, comm_bank);
             } else {
                 if (printinfo) std::cout << " block " << nodeid << " does not exist " << std::endl;
                 // Block with this id does not exist.
@@ -253,27 +221,30 @@ void Bank::open() {
             // NB: BLOCKDATA has no queue system yet
             int orbid = messages[2]; // which block to fetch from
 
-            if (orbid2block.count(orbid) and orbid2block[orbid] != nullptr) {
-                Blockdata_struct *block = orbid2block[orbid];
-                int dataindex = 0; // internal index of the data in the block
-                int size = 0;
+            if (orbid2block.count(orbid)) {
+                OrbBlock_struct &block = orbid2block[orbid];
+                if (block.data.size() == 0) std::cout << "Zero size blockdata! C " << orbid << " " << std::endl;
                 // send entire block. First make one contiguous superblock
                 // Prepare the data as one contiguous block
-                if (block->data.size() == 0) std::cout << "Zero size blockdata! C " << orbid << " " << block->N_rows.size() << std::endl;
-                size = 0;
-                for (int j = 0; j < block->data.size(); j++) size += block->N_rows[j];
-
+                int size = 0;
+                for (int j = 0; j < block.data.size(); j++) {
+                    int nodeid = block.id[j];
+                    int Nrows = nodeid2block[nodeid].N_rows; // note that root nodes have scaling and wavelets, while other nodes have only wavelets -> N_rows is not a constant.
+                    size += Nrows;
+                }
                 std::vector<double> coeff(size);
                 int ij = 0;
-                for (int j = 0; j < block->data.size(); j++) {
-                    for (int i = 0; i < block->N_rows[j]; i++) { coeff[ij++] = block->data[j][i]; }
+                for (int j = 0; j < block.data.size(); j++) {
+                    int nodeid = block.id[j];
+                    int Nrows = nodeid2block[nodeid].N_rows;
+                    for (int i = 0; i < Nrows; i++) { coeff[ij++] = block.data[j][i]; }
                 }
                 // send info about the size of the superblock
                 metadata_block[0] = orbid;
-                metadata_block[1] = block->data.size(); // number of columns
+                metadata_block[1] = block.data.size(); // number of columns
                 metadata_block[2] = size;               // total size = rows*columns
                 MPI_Send(metadata_block, size_metadata, MPI_INT, status.MPI_SOURCE, 1, comm_bank);
-                MPI_Send(block->id.data(), metadata_block[1], MPI_INT, status.MPI_SOURCE, 2, comm_bank);
+                MPI_Send(block.id.data(), metadata_block[1], MPI_INT, status.MPI_SOURCE, 2, comm_bank);
                 MPI_Send(coeff.data(), size, MPI_DOUBLE, status.MPI_SOURCE, 3, comm_bank);
             } else {
                 // it is possible and allowed that the block has not been written
@@ -336,7 +307,6 @@ void Bank::open() {
                         id2ix[id] = 0;
                     }
                 }
-                // if (message == GET_FUNCTION) { send_function(*deposits[ix].orb, status.MPI_SOURCE, 1, comm_bank); }
                 if (message == GET_DATA) { MPI_Send(deposits[ix].data, deposits[ix].datasize, MPI_DOUBLE, status.MPI_SOURCE, 1, comm_bank); }
             }
         } else if (message == SAVE_NODEDATA) {
@@ -346,38 +316,26 @@ void Bank::open() {
 
             // test if the block exists already
             if (printinfo) std::cout << world_rank << " save data nodeid " << nodeid << " size " << size << std::endl;
-            if (nodeid2block.count(nodeid) == 0 or nodeid2block[nodeid] == nullptr) {
-                if (printinfo) std::cout << world_rank << " block does not exist yet  " << std::endl;
-                // the block does not exist yet, create it
-                Blockdata_struct *block = new Blockdata_struct;
-                nodeid2block[nodeid] = block;
-            }
-            if (orbid2block.count(orbid) == 0 or orbid2block[orbid] == nullptr) {
-                // the block does not exist yet, create it
-                Blockdata_struct *orbblock = new Blockdata_struct;
-                orbid2block[orbid] = orbblock;
-            }
             // append the incoming data
-            Blockdata_struct *block = nodeid2block[nodeid];
-            block->id2data[orbid] = nodeid2block[nodeid]->data.size(); // internal index of the data in the block
-            double *data_p = new double[size];
+            Blockdata_struct &block = nodeid2block[nodeid];
+            block.id2data[orbid] = nodeid2block[nodeid].data.size(); // internal index of the data in the block
+            double *data_p = mem[account]->get_mem(size);//new double[size];
             currentsize[account] += size / 128; // converted into kB
             totcurrentsize += size / 128;       // converted into kB
             this->maxsize = std::max(totcurrentsize, this->maxsize);
-            block->data.push_back(data_p);
-            block->deleted.push_back(false);
-            block->id.push_back(orbid);
-            block->N_rows.push_back(size);
+            block.data.push_back(data_p);
+            block.id.push_back(orbid);
+            if (block.N_rows > 0 and block.N_rows != size) cout<<" ERROR block size incompatible " <<block.N_rows <<" "<< size<<endl;
+            block.N_rows = size;
 
-            Blockdata_struct *orbblock = orbid2block[orbid];
-            orbblock->id2data[nodeid] = orbblock->data.size(); // internal index of the data in the block
-            orbblock->data.push_back(data_p);
-            orbblock->deleted.push_back(false);
-            orbblock->id.push_back(nodeid);
-            orbblock->N_rows.push_back(size);
+            OrbBlock_struct &orbblock = orbid2block[orbid];
+            orbblock.id2data[nodeid] = orbblock.data.size(); // internal index of the data in the block
+            orbblock.data.push_back(data_p);
+            orbblock.id.push_back(nodeid);
+            //orbblock.N_rows.push_back(size);
 
             MPI_Recv(data_p, size, MPI_DOUBLE, status.MPI_SOURCE, 1, comm_bank, &status);
-            if (printinfo) std::cout << " written block " << nodeid << " id " << orbid << " subblocks " << nodeid2block[nodeid]->data.size() << std::endl;
+            if (printinfo) std::cout << " written block " << nodeid << " id " << orbid << " subblocks " << nodeid2block[nodeid].data.size() << std::endl;
         } else if (message == SAVE_FUNCTION or message == SAVE_DATA) {
             // make a new deposit
             int id = messages[2];
@@ -402,7 +360,11 @@ void Bank::open() {
                 if (message == SAVE_DATA and !deposits[ix].hasdata) {
                     datasize = messages[3];
                     exist_flag = 0;
-                    deposits[ix].data = new double[datasize];
+                    //                    deposits[ix].data = new double[datasize];
+                    deposits[ix].data = mem[account]->get_mem(datasize);
+                    currentsize[account] += datasize / 128; // converted into kB
+                    totcurrentsize += datasize / 128;       // converted into kB
+                    this->maxsize = std::max(totcurrentsize, this->maxsize);
                     deposits[ix].hasdata = true;
                 }
             } else {
@@ -411,7 +373,10 @@ void Bank::open() {
                 if (message == SAVE_FUNCTION) deposits[ix].orb = new ComplexFunction(0);
                 if (message == SAVE_DATA) {
                     datasize = messages[3];
-                    deposits[ix].data = new double[datasize];
+                    deposits[ix].data = mem[account]->get_mem(datasize);//new double[datasize];
+                    currentsize[account] += datasize / 128; // converted into kB
+                    totcurrentsize += datasize / 128;       // converted into kB
+                    this->maxsize = std::max(totcurrentsize, this->maxsize);
                     deposits[ix].hasdata = true;
                 }
             }
@@ -420,6 +385,7 @@ void Bank::open() {
             deposits[ix].source = status.MPI_SOURCE;
             if (message == SAVE_FUNCTION) {
                 recv_function(*deposits[ix].orb, deposits[ix].source, 1, comm_bank);
+                cout<<"recv ORB size "<<deposits[ix].orb->getSizeNodes(NUMBER::Total)<<endl;
                 if (exist_flag == 0) {
                     currentsize[account] += deposits[ix].orb->getSizeNodes(NUMBER::Total);
                     totcurrentsize += deposits[ix].orb->getSizeNodes(NUMBER::Total);
@@ -430,9 +396,6 @@ void Bank::open() {
                 datasize = messages[3];
                 deposits[ix].datasize = datasize;
                 MPI_Recv(deposits[ix].data, datasize, MPI_DOUBLE, deposits[ix].source, 1, comm_bank, &status);
-                currentsize[account] += datasize / 128; // converted into kB
-                totcurrentsize += datasize / 128;       // converted into kB
-                this->maxsize = std::max(totcurrentsize, this->maxsize);
             }
             if (id2qu[deposits[ix].id] != 0) {
                 // someone is waiting for those data. Send to them
@@ -509,6 +472,7 @@ int Bank::clearAccount(int account, int iclient, MPI_Comm comm) {
 }
 void Bank::remove_account(int account) {
 #ifdef MRCPP_HAS_MPI
+    naccounts--;
     auto it = get_deposits.find(account);
     if (it == get_deposits.end() || it->second == nullptr) {
         cout << "ERROR, my account depositsdoes not exist!! " << account << " " << endl;
@@ -516,75 +480,41 @@ void Bank::remove_account(int account) {
     }
     std::vector<deposit> &deposits = *get_deposits[account];
     for (int ix = 1; ix < deposits.size(); ix++) {
-        if (deposits[ix].orb != nullptr) deposits[ix].orb->free(NUMBER::Total);
-        if (deposits[ix].hasdata) delete deposits[ix].data;
-        deposits[ix].hasdata = false;
+       if (deposits[ix].orb != nullptr) deposits[ix].orb->free(NUMBER::Total);
+       if (deposits[ix].hasdata) {
+           currentsize[account] -= deposits[ix].datasize / 128;
+           totcurrentsize -= deposits[ix].datasize / 128;
+       }
+       if (deposits[ix].hasdata) (*get_id2ix[account])[deposits[ix].id] = 0; // indicate that it does not exist
+       deposits[ix].hasdata = false;
     }
     deposits.clear();
+    get_deposits.erase(account);
     delete get_queue[account];
     get_queue.erase(account);
     delete get_id2ix[account];
-    delete get_id2qu[account];
-    delete get_readytasks[account];
     get_id2ix.erase(account);
+    delete get_id2qu[account];
     get_id2qu.erase(account);
-    get_deposits.erase(account);
-    currentsize.erase(account);
+    delete get_readytasks[account];
     get_readytasks.erase(account);
 
-    std::map<int, Blockdata_struct *> &nodeid2block = *get_nodeid2block[account];
-    std::map<int, Blockdata_struct *> &orbid2block = *get_orbid2block[account];
+    std::map<int, Blockdata_struct> &nodeid2block = *get_nodeid2block[account];
+    std::map<int, OrbBlock_struct> &orbid2block = *get_orbid2block[account];
 
-    std::vector<int> toeraseVec; // it is dangerous to erase an iterator within its own loop
     for (auto const &block : nodeid2block) {
-        if (block.second == nullptr) toeraseVec.push_back(block.first);
-        if (block.second == nullptr) continue;
-        for (int i = 0; i < block.second->data.size(); i++) {
-            if (not block.second->deleted[i]) {
-                currentsize[account] -= block.second->N_rows[i] / 128; // converted into kB
-                totcurrentsize -= block.second->N_rows[i] / 128;       // converted into kB
-                delete[] block.second->data[i];
-            }
-        }
-        currentsize[account] -= block.second->BlockData.size() / 128; // converted into kB
-        totcurrentsize -= block.second->BlockData.size() / 128;       // converted into kB
-        block.second->BlockData.resize(0, 0);                         // NB: the matrix does not clear itself otherwise
-        // assert(currentsize[account] >= 0);
-        toeraseVec.push_back(block.first);
-    }
-    for (int ierase : toeraseVec) { nodeid2block.erase(ierase); }
-    toeraseVec.clear();
-    std::vector<int> datatoeraseVec; // it is dangerous to erase an iterator within its own loop
-    for (auto const &block : orbid2block) {
-        if (block.second == nullptr) toeraseVec.push_back(block.first);
-        if (block.second == nullptr) continue;
-        datatoeraseVec.clear();
-        for (int i = 0; i < block.second->data.size(); i++) {
-            datatoeraseVec.push_back(i);
-            block.second->data[i] = nullptr;
-        }
-        std::sort(datatoeraseVec.begin(), datatoeraseVec.end());
-        std::reverse(datatoeraseVec.begin(), datatoeraseVec.end());
-        for (int ierase : datatoeraseVec) {
-            block.second->id.erase(block.second->id.begin() + ierase);
-            block.second->data.erase(block.second->data.begin() + ierase);
-            block.second->N_rows.erase(block.second->N_rows.begin() + ierase);
-        }
-        if (block.second->data.size() == 0) toeraseVec.push_back(block.first);
+        currentsize[account] -= block.second.N_rows * block.second.data.size()/ 128; // converted into kB
+        totcurrentsize -= block.second.N_rows * block.second.data.size()/ 128;       // converted into kB
     }
-    for (int ierase : toeraseVec) { orbid2block.erase(ierase); }
-
+    nodeid2block.clear();
     orbid2block.clear();
-    for (int ix = 1; ix < deposits.size(); ix++) {
-        if (deposits[ix].hasdata) delete deposits[ix].data;
-        if (deposits[ix].hasdata) (*get_id2ix[account])[deposits[ix].id] = 0; // indicate that it does not exist
-        deposits[ix].hasdata = false;
-    }
-    delete get_nodeid2block[account];
-    delete get_orbid2block[account];
+
     get_nodeid2block.erase(account);
     get_orbid2block.erase(account);
 
+    for (double* c_p : mem[account]->chunk_p) delete [] c_p;
+    mem.erase(account);
+    currentsize.erase(account);
 #endif
 }
 
@@ -804,7 +734,6 @@ int BankAccount::put_data(NodeIndex<3> nIdx, int size, double *data) {
     messages[5] = nIdx.getTranslation(1);
     messages[6] = nIdx.getTranslation(2);
     int id = std::abs(nIdx.getTranslation(0) + nIdx.getTranslation(1) + nIdx.getTranslation(2));
-    //    std::cout<<mpi::wrk_rank<<" bankidx "<<bank_size<<" ID "<<messages[4]<<" "<<messages[2]<<" "<<messages[5]<<" "<<messages[6]<<std::endl;
     MPI_Send(messages, 7, MPI_INT, bankmaster[id % bank_size], 0, comm_bank);
     MPI_Send(data, size, MPI_DOUBLE, bankmaster[id % bank_size], 1, comm_bank);
 #endif
@@ -878,7 +807,7 @@ int BankAccount::get_nodedata(int id, int nodeid, int size, double *data, std::v
     return 1;
 }
 
-// get all data for nodeid
+// get all data for nodeid (same nodeid, different orbitals)
 int BankAccount::get_nodeblock(int nodeid, double *data, std::vector<int> &idVec) {
 #ifdef MRCPP_HAS_MPI
     MPI_Status status;
@@ -898,7 +827,7 @@ int BankAccount::get_nodeblock(int nodeid, double *data, std::vector<int> &idVec
     return 1;
 }
 
-// get all data with identity orbid
+// get all data with identity orbid (same orbital, different nodes)
 int BankAccount::get_orbblock(int orbid, double *&data, std::vector<int> &nodeidVec, int bankstart) {
 #ifdef MRCPP_HAS_MPI
     MPI_Status status;
@@ -919,30 +848,6 @@ int BankAccount::get_orbblock(int orbid, double *&data, std::vector<int> &nodeid
     return 1;
 }
 
-// remove all blockdata with nodeid < nodeidmax
-// NB:: collective call. All clients must call this
-void BankAccount::clear_blockdata(int iclient, int nodeidmax, MPI_Comm comm) {
-#ifdef MRCPP_HAS_MPI
-    // 1) wait until all clients are ready
-    MPI_Barrier(comm);
-    // master send signal to bank
-    if (iclient == 0) {
-        int messages[message_size];
-        messages[0] = CLEAR_BLOCKS;
-        messages[1] = account_id;
-        messages[2] = nodeidmax;
-        for (int i = 0; i < bank_size; i++) { MPI_Send(messages, 3, MPI_INT, bankmaster[i], 0, comm_bank); }
-        for (int i = 0; i < bank_size; i++) {
-            // wait until Bank is finished and has sent signal
-            MPI_Status status;
-            int message;
-            MPI_Recv(&message, 1, MPI_INT, bankmaster[i], 78, comm_bank, &status);
-        }
-    }
-    MPI_Barrier(comm);
-#endif
-}
-
 // creator. NB: collective
 BankAccount::BankAccount(int iclient, MPI_Comm comm) {
     this->account_id = dataBank.openAccount(iclient, comm);
diff --git a/src/utils/Bank.h b/src/utils/Bank.h
index 18cba950f..501faa7a0 100644
--- a/src/utils/Bank.h
+++ b/src/utils/Bank.h
@@ -107,7 +107,6 @@ class BankAccount {
     int get_nodedata(int id, int nodeid, int size, double *data, std::vector<int> &idVec);
     int get_nodeblock(int nodeid, double *data, std::vector<int> &idVec);
     int get_orbblock(int orbid, double *&data, std::vector<int> &nodeidVec, int bankstart);
-    void clear_blockdata(int i = wrk_rank, int nodeidmax = 0, MPI_Comm comm = comm_wrk);
 };
 
 class TaskManager {
diff --git a/src/utils/ComplexFunction.cpp b/src/utils/ComplexFunction.cpp
index 9d65cddf0..63d855727 100644
--- a/src/utils/ComplexFunction.cpp
+++ b/src/utils/ComplexFunction.cpp
@@ -304,7 +304,7 @@ void ComplexFunction::rescale(double c) {
 /** @brief In place multiply with complex scalar. Involves a deep copy.*/
 void ComplexFunction::rescale(ComplexDouble c) {
     ComplexFunction &out = *this;
-    ComplexFunction tmp(isShared());
+    ComplexFunction tmp(spin(), occ(), rank, isShared());
     cplxfunc::deep_copy(tmp, out);
     out.free(NUMBER::Total);
     out.add(c, tmp);
@@ -370,6 +370,7 @@ ComplexDouble cplxfunc::node_norm_dot(ComplexFunction bra, ComplexFunction ket,
 void cplxfunc::deep_copy(ComplexFunction &out, ComplexFunction &inp) {
     bool need_to_copy = not(out.isShared()) or mpi::share_master();
     out.funcMRA = inp.funcMRA;
+    out.setRank(inp.getRank());
     if (inp.hasReal()) {
         if (not out.hasReal()) out.alloc(NUMBER::Real);
         if (need_to_copy) {
@@ -652,12 +653,25 @@ void cplxfunc::multiply_imag(ComplexFunction &out, ComplexFunction inp_a, Comple
 
 namespace mpifuncvec {
 
-void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) {
+
+/** @brief Make a linear combination of functions
+ *
+ * Uses "local" representation: treats one node at a time.
+ * For each node, all functions are transformed simultaneously
+ * by a dense matrix multiplication.
+ * Phi input functions, Psi output functions
+ *
+ */
+void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, MPI_FuncVector &Psi, double prec) {
 
     // The principle of this routine is that nodes are rotated one by one using matrix multiplication.
     // The routine does avoid when possible to move data, but uses pointers and indices manipulation.
     // MPI version does not use OMP yet, Serial version uses OMP
+    // size of input is N, size of output is M
     int N = Phi.size();
+    int M = Psi.size();
+    if (U.rows() < N) MSG_ABORT("Incompatible number of rows for U matrix");
+    if (U.cols() < M) MSG_ABORT("Incompatible number of columns for U matrix");
 
     // 1) make union tree without coefficients
     FunctionTree<3> refTree(*Phi.vecMRA);
@@ -674,21 +688,21 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) {
     refTree.makeCoeffVector(coeffVec_ref, indexVec_ref, parindexVec_ref, scalefac_ref, max_ix, refTree);
     int max_n = indexVec_ref.size();
 
-    // 2) We work with real numbers only. Make real blocks for U matrix
+   // 2) We work with real numbers only. Make real blocks for U matrix
     bool UhasReal = false;
     bool UhasImag = false;
     for (int i = 0; i < N; i++) {
-        for (int j = 0; j < N; j++) {
-            if (std::abs(U(i, j).real()) > MachineZero) UhasReal = true;
-            if (std::abs(U(i, j).imag()) > MachineZero) UhasImag = true;
+        for (int j = 0; j < M; j++) {
+            if (std::abs(U(i, j).real()) > 10*MachineZero) UhasReal = true;
+            if (std::abs(U(i, j).imag()) > 10*MachineZero) UhasImag = true;
         }
     }
 
     IntVector PsihasReIm = IntVector::Zero(2);
-    for (int i = 0; i < N; i++) {
-        if (!mpi::my_orb(i)) continue;
-        PsihasReIm[0] = (Phi[i].hasReal()) ? 1 : 0;
-        PsihasReIm[1] = (Phi[i].hasImag()) ? 1 : 0;
+    for (int j = 0; j < N; j++) {
+        if (!mpi::my_orb(j)) continue;
+        PsihasReIm[0] = (Phi[j].hasReal()) ? 1 : 0;
+        PsihasReIm[1] = (Phi[j].hasImag()) ? 1 : 0;
     }
     mpi::allreduce_vector(PsihasReIm, mpi::comm_wrk);
     if (not PsihasReIm[0] and not PsihasReIm[1]) {
@@ -698,21 +712,23 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) {
     bool makeReal = (UhasReal and PsihasReIm[0]) or (UhasImag and PsihasReIm[1]);
     bool makeImag = (UhasReal and PsihasReIm[1]) or (UhasImag and PsihasReIm[0]);
 
-    for (int i = 0; i < N; i++) {
-        if (!mpi::my_orb(i)) continue;
-        if (not makeReal and Phi[i].hasReal()) Phi[i].free(NUMBER::Real);
-        if (not makeImag and Phi[i].hasImag()) Phi[i].free(NUMBER::Imag);
+    for (int j = 0; j < M; j++) {
+        if (!mpi::my_orb(j)) continue;
+        if (not makeReal and Psi[j].hasReal()) Psi[j].free(NUMBER::Real);
+        if (not makeImag and Psi[j].hasImag()) Psi[j].free(NUMBER::Imag);
     }
 
     if (not makeReal and not makeImag) { return; }
 
-    int Neff = N;               // effective number of orbitals
+    int Neff = N;               // effective number of input orbitals
+    int Meff = M;               // effective number of output orbitals
     if (makeImag) Neff = 2 * N; // Imag and Real treated independently. We always use real part of U
+    if (makeImag) Meff = 2 * M; // Imag and Real treated independently. We always use real part of U
 
     IntVector conjMat = IntVector::Zero(Neff);
-    for (int i = 0; i < Neff; i++) {
-        if (!mpi::my_orb(i % N)) continue;
-        conjMat[i] = (Phi[i % N].conjugate()) ? -1 : 1;
+    for (int j = 0; j < Neff; j++) {
+        if (!mpi::my_orb(j % N)) continue;
+        conjMat[j] = (Phi[j % N].conjugate()) ? -1 : 1;
     }
     mpi::allreduce_vector(conjMat, mpi::comm_wrk);
 
@@ -720,22 +736,22 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) {
     // out_r = U_rr*in_r - U_ir*in_i*conjMat
     // out_i = U_ri*in_r - U_ii*in_i*conjMat
     // the first index of U is the one used on input Phi
-    DoubleMatrix Ureal(Neff, Neff); // four blocks, for rr ri ir ii
-    for (int i = 0; i < Neff; i++) {
-        for (int j = 0; j < Neff; j++) {
+    DoubleMatrix Ureal(Neff, Meff); // four blocks, for rr ri ir ii
+    for (int j = 0; j < Neff; j++) {
+        for (int i = 0; i < Meff; i++) {
             double sign = 1.0;
-            if (i < N and j < N) {
+            if (j < N and i < M) {
                 // real U applied on real Phi
-                Ureal(i, j) = U.real()(i % N, j % N);
-            } else if (i >= N and j >= N) {
+                Ureal(j, i) = U.real()(j % N, i % M);
+            } else if (j >= N and i >= M) {
                 // real U applied on imag Phi
-                Ureal(i, j) = conjMat[i] * U.real()(i % N, j % N);
-            } else if (i < N and j >= N) {
+                Ureal(j, i) = conjMat[j] * U.real()(j % N, i % M);
+            } else if (j < N and i >= M) {
                 // imag U applied on real Phi
-                Ureal(i, j) = U.imag()(i % N, j % N);
+                Ureal(j, i) = U.imag()(j % N, i % M);
             } else {
                 // imag U applied on imag Phi
-                Ureal(i, j) = -1.0 * conjMat[i] * U.imag()(i % N, j % N);
+                Ureal(j, i) = -1.0 * conjMat[j] * U.imag()(j % N, i % M);
             }
         }
     }
@@ -788,21 +804,19 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) {
     }
 
     // 4) rotate all the nodes
-    IntMatrix split_serial;                             // in the serial case all split are store in one array
-    std::vector<double> split(Neff, -1.0);              // which orbitals need splitting (at a given node). For now double for compatibilty with bank
-    std::vector<double> needsplit(Neff, 1.0);           // which orbitals need splitting
-    std::vector<std::vector<double *>> coeffpVec(Neff); // to put pointers to the rotated coefficient for each orbital in serial case
-    std::vector<std::map<int, int>> ix2coef(Neff);      // to find the index in for example rotCoeffVec[] corresponding to a serialIx
+    IntMatrix split_serial;                             // in the serial case all split are stored in one array
+    std::vector<std::vector<double *>> coeffpVec(Meff); // to put pointers to the rotated coefficient for each orbital in serial case
+    std::vector<std::map<int, int>> ix2coef(Meff);      // to find the index in for example rotCoeffVec[] corresponding to a serialIx
     int csize;                                          // size of the current coefficients (different for roots and branches)
     std::vector<DoubleMatrix> rotatedCoeffVec;          // just to ensure that the data from rotatedCoeff is not deleted, since we point to it.
     // j indices are for unrotated orbitals, i indices are for rotated orbitals
     if (serial) {
         std::map<int, int> ix2coef_ref;   // to find the index n corresponding to a serialIx
-        split_serial.resize(Neff, max_n); // not use in the MPI case
+        split_serial.resize(Meff, max_n); // not use in the MPI case
         for (int n = 0; n < max_n; n++) {
             int node_ix = indexVec_ref[n]; // SerialIx for this node in the reference tree
             ix2coef_ref[node_ix] = n;
-            for (int i = 0; i < Neff; i++) split_serial(i, n) = 1;
+            for (int i = 0; i < Meff; i++) split_serial(i, n) = 1;
         }
 
         std::vector<int> nodeReady(max_n, 0); // To indicate to OMP threads that the parent is ready (for splits)
@@ -836,9 +850,9 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) {
             };
 
             std::vector<int> orbiVec;
-            for (int i = 0; i < Neff; i++) { // loop over all rotated orbitals
-                if (not makeReal and i < N) continue;
-                if (not makeImag and i >= N) continue;
+            for (int i = 0; i < Meff; i++) { // loop over all rotated orbitals
+                if (not makeReal and i < M) continue;
+                if (not makeImag and i >= M) continue;
                 if (parindexVec_ref[n] >= 0 and split_serial(i, ix2coef_ref[parindexVec_ref[n]]) == 0) continue; // parent node has too small wavelets
                 orbiVec.push_back(i);
             }
@@ -886,12 +900,11 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) {
                 rotatedCoeffVec.push_back(std::move(rotatedCoeff));
             }
         }
-
     } else { // MPI case
 
         // TODO? rotate in bank, so that we do not get and put. Requires clever handling of splits.
-        std::vector<double> split(Neff, -1.0);    // which orbitals need splitting (at a given node). For now double for compatibilty with bank
-        std::vector<double> needsplit(Neff, 1.0); // which orbitals need splitting
+        std::vector<double> split(Meff, -1.0);    // which orbitals need splitting (at a given node). For now double for compatibilty with bank
+        std::vector<double> needsplit(Meff, 1.0); // which orbitals need splitting
         BankAccount nodeSplits;
         mpi::barrier(mpi::comm_wrk); // required for now, as the blockdata functionality has no queue yet.
 
@@ -906,13 +919,13 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) {
             int parentid = parindexVec_ref[n];
             if (parentid == -1) {
                 // root node, split if output needed
-                for (int i = 0; i < N; i++) {
+                for (int i = 0; i < M; i++) {
                     if (makeReal)
                         split[i] = 1.0;
                     else
                         split[i] = -1.0;
                 }
-                for (int i = N; i < Neff; i++) {
+                for (int i = N; i < Meff; i++) {
                     if (makeImag)
                         split[i] = 1.0;
                     else
@@ -921,12 +934,12 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) {
                 csize = sizecoeff;
             } else {
                 // note that it will wait until data is available
-                nodeSplits.get_data(parentid, Neff, split.data());
+                nodeSplits.get_data(parentid, Meff, split.data());
                 csize = sizecoeffW;
             }
             std::vector<int> orbiVec;
             std::vector<int> orbjVec;
-            for (int i = 0; i < Neff; i++) {  // loop over rotated orbitals
+            for (int i = 0; i < Meff; i++) {  // loop over rotated orbitals
                 if (split[i] < 0.0) continue; // parent node has too small wavelets
                 orbiVec.push_back(i);
             }
@@ -960,7 +973,7 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) {
                 if (thres < wnorm or prec < 0) needsplit[orbiVec[i]] = 1.0;
                 nodesRotated.put_nodedata(orbiVec[i], indexVec_ref[n] + max_ix, csize, rotatedCoeff.col(i).data());
             }
-            nodeSplits.put_data(indexVec_ref[n], Neff, needsplit.data());
+            nodeSplits.put_data(indexVec_ref[n], Meff, needsplit.data());
         }
         mpi::barrier(mpi::comm_wrk); // wait until all rotated nodes are ready
     }
@@ -973,22 +986,23 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) {
         // operation is writing the coefficient into the tree)
 
 #pragma omp parallel for schedule(static)
-        for (int j = 0; j < Neff; j++) {
-            if (j < N) {
-                if (!Phi[j].hasReal()) Phi[j].alloc(NUMBER::Real);
-                Phi[j].real().clear();
-                Phi[j].real().makeTreefromCoeff(refTree, coeffpVec[j], ix2coef[j], prec);
+        for (int j = 0; j < Meff; j++) {
+            if (coeffpVec[j].size()==0) continue;
+            if (j < M) {
+                if (!Psi[j].hasReal()) Psi[j].alloc(NUMBER::Real);
+                Psi[j].real().clear();
+                Psi[j].real().makeTreefromCoeff(refTree, coeffpVec[j], ix2coef[j], prec);
             } else {
-                if (!Phi[j % N].hasImag()) Phi[j % N].alloc(NUMBER::Imag);
-                Phi[j % N].imag().clear();
-                Phi[j % N].imag().makeTreefromCoeff(refTree, coeffpVec[j], ix2coef[j], prec);
+                if (!Psi[j % M].hasImag()) Psi[j % M].alloc(NUMBER::Imag);
+                Psi[j % M].imag().clear();
+                Psi[j % M].imag().makeTreefromCoeff(refTree, coeffpVec[j], ix2coef[j], prec);
             }
         }
 
     } else { // MPI case
 
-        for (int j = 0; j < Neff; j++) {
-            if (not mpi::my_orb(j % N)) continue;
+        for (int j = 0; j < Meff; j++) {
+            if (not mpi::my_orb(j % M)) continue;
             // traverse possible nodes, and stop descending when norm is zero (leaf in out[j])
             std::vector<double *> coeffpVec; //
             std::map<int, int> ix2coef;      // to find the index in coeffVec[] corresponding to a serialIx
@@ -1010,16 +1024,16 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) {
                     shift += csize;
                 }
             }
-            if (j < N) {
+            if (j < M) {
                 // Real part
-                if (!Phi[j].hasReal()) Phi[j].alloc(NUMBER::Real);
-                Phi[j].real().clear();
-                Phi[j].real().makeTreefromCoeff(refTree, coeffpVec, ix2coef, prec);
+                if (!Psi[j].hasReal()) Psi[j].alloc(NUMBER::Real);
+                Psi[j].real().clear();
+                Psi[j].real().makeTreefromCoeff(refTree, coeffpVec, ix2coef, prec);
             } else {
                 // Imag part
-                if (!Phi[j % N].hasImag()) Phi[j % N].alloc(NUMBER::Imag);
-                Phi[j % N].imag().clear();
-                Phi[j % N].imag().makeTreefromCoeff(refTree, coeffpVec, ix2coef, prec);
+                if (!Psi[j % M].hasImag()) Psi[j % M].alloc(NUMBER::Imag);
+                Psi[j % M].imag().clear();
+                Psi[j % M].imag().makeTreefromCoeff(refTree, coeffpVec, ix2coef, prec);
             }
             for (double *p : pointerstodelete) delete[] p;
             pointerstodelete.clear();
@@ -1027,6 +1041,12 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) {
     }
 }
 
+
+void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) {
+    rotate(Phi, U, Phi, prec);
+    return;
+}
+
 /** @brief Save all nodes in bank; identify them using serialIx from refTree
  * shift is a shift applied in the id
  */
@@ -1551,10 +1571,10 @@ ComplexMatrix calc_overlap_matrix(MPI_FuncVector &BraKet) {
                 S_temp.noalias() = coeffBlock.transpose() * coeffBlock;
                 for (int i = 0; i < orbVec.size(); i++) {
                     for (int j = 0; j < orbVec.size(); j++) {
-                        // if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta)
-                        //    continue;
-                        // if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha)
-                        //    continue;
+                        if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta)
+                            continue;
+                        if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha)
+                            continue;
                         double &Srealij = Sreal(orbVec[i], orbVec[j]);
                         double &Stempij = S_temp(i, j);
 #pragma omp atomic
@@ -1572,17 +1592,16 @@ ComplexMatrix calc_overlap_matrix(MPI_FuncVector &BraKet) {
                 S_temp.noalias() = coeffBlock.transpose() * coeffBlock;
                 for (int i = 0; i < orbVec.size(); i++) {
                     for (int j = 0; j < orbVec.size(); j++) {
-                        // if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta)
-                        //    continue;
-                        // if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha)
-                        //    continue;
+                        if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta)
+                            continue;
+                        if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha)
+                            continue;
                         Sreal(orbVec[i], orbVec[j]) += S_temp(i, j);
                     }
                 }
             }
         }
     }
-
     IntVector conjMat = IntVector::Zero(N);
     for (int i = 0; i < N; i++) {
         if (!mrcpp::mpi::my_orb(BraKet[i])) continue;
@@ -1716,7 +1735,8 @@ ComplexMatrix calc_overlap_matrix(MPI_FuncVector &Bra, MPI_FuncVector &Ket) {
     int totget = 0;
     int mxtotsiz = 0;
     int ibank = 0;
-#pragma omp parallel for schedule(dynamic) if (serial)
+    //For some unknown reason the h2_mag_lda test sometimes fails when schedule(dynamic) is chosen
+#pragma omp parallel for schedule(static) if (serial)
     for (int n = 0; n < max_n; n++) {
         if (n % mrcpp::mpi::wrk_size != mrcpp::mpi::wrk_rank) continue;
         int csize;
@@ -1749,10 +1769,10 @@ ComplexMatrix calc_overlap_matrix(MPI_FuncVector &Bra, MPI_FuncVector &Ket) {
                 S_temp.noalias() = coeffBlockBra.transpose() * coeffBlockKet;
                 for (int i = 0; i < orbVecBra.size(); i++) {
                     for (int j = 0; j < orbVecKet.size(); j++) {
-                        // if (Bra[orbVecBra[i] % N].spin() == SPIN::Alpha and Ket[orbVecKet[j] % M].spin() == SPIN::Beta)
-                        //    continue;
-                        // if (Bra[orbVecBra[i] % N].spin() == SPIN::Beta and Ket[orbVecKet[j] % M].spin() == SPIN::Alpha)
-                        //    continue;
+                        if (Bra[orbVecBra[i] % N].spin() == SPIN::Alpha and Ket[orbVecKet[j] % M].spin() == SPIN::Beta)
+                            continue;
+                        if (Bra[orbVecBra[i] % N].spin() == SPIN::Beta and Ket[orbVecKet[j] % M].spin() == SPIN::Alpha)
+                            continue;
                         // must ensure that threads are not competing
                         double &Srealij = Sreal(orbVecBra[i], orbVecKet[j]);
                         double &Stempij = S_temp(i, j);
@@ -1761,7 +1781,6 @@ ComplexMatrix calc_overlap_matrix(MPI_FuncVector &Bra, MPI_FuncVector &Ket) {
                     }
                 }
             }
-
         } else {
 
             DoubleMatrix coeffBlockBra(csize, 2 * N);
@@ -1778,10 +1797,10 @@ ComplexMatrix calc_overlap_matrix(MPI_FuncVector &Bra, MPI_FuncVector &Ket) {
                 S_temp.noalias() = coeffBlockBra.transpose() * coeffBlockKet;
                 for (int i = 0; i < orbVecBra.size(); i++) {
                     for (int j = 0; j < orbVecKet.size(); j++) {
-                        // if (Bra[orbVecBra[i] % N].spin() == SPIN::Alpha and Ket[orbVecKet[j] % M].spin() == SPIN::Beta)
-                        //    continue;
-                        // if (Bra[orbVecBra[i] % N].spin() == SPIN::Beta and Ket[orbVecKet[j] % M].spin() == SPIN::Alpha)
-                        //    continue;
+                        if (Bra[orbVecBra[i] % N].spin() == SPIN::Alpha and Ket[orbVecKet[j] % M].spin() == SPIN::Beta)
+                            continue;
+                        if (Bra[orbVecBra[i] % N].spin() == SPIN::Beta and Ket[orbVecKet[j] % M].spin() == SPIN::Alpha)
+                            continue;
                         Sreal(orbVecBra[i], orbVecKet[j]) += S_temp(i, j);
                     }
                 }
@@ -1813,12 +1832,6 @@ ComplexMatrix calc_overlap_matrix(MPI_FuncVector &Bra, MPI_FuncVector &Ket) {
 
     mrcpp::mpi::allreduce_matrix(S, mrcpp::mpi::comm_wrk);
 
-    conjMatKet[0] = totsiz;
-    conjMatKet[1] = mxtotsiz;
-    conjMatKet[2] = totget;
-
-    mrcpp::mpi::allreduce_vector(conjMatKet, mrcpp::mpi::comm_wrk);
-
     return S;
 }
 
@@ -1922,10 +1935,10 @@ DoubleMatrix calc_norm_overlap_matrix(MPI_FuncVector &BraKet) {
                 S_temp.noalias() = coeffBlock.transpose() * coeffBlock;
                 for (int i = 0; i < orbVec.size(); i++) {
                     for (int j = 0; j < orbVec.size(); j++) {
-                        // if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta)
-                        //    continue;
-                        // if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha)
-                        //    continue;
+                        if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta)
+                            continue;
+                        if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha)
+                            continue;
                         double &Srealij = Sreal(orbVec[i], orbVec[j]);
                         double &Stempij = S_temp(i, j);
 #pragma omp atomic
@@ -1944,10 +1957,10 @@ DoubleMatrix calc_norm_overlap_matrix(MPI_FuncVector &BraKet) {
                 S_temp.noalias() = coeffBlock.transpose() * coeffBlock;
                 for (int i = 0; i < orbVec.size(); i++) {
                     for (int j = 0; j < orbVec.size(); j++) {
-                        // if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta)
-                        //    continue;
-                        // if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha)
-                        //    continue;
+                        if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta)
+                            continue;
+                        if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha)
+                            continue;
                         Sreal(orbVec[i], orbVec[j]) += S_temp(i, j);
                     }
                 }
@@ -1971,9 +1984,33 @@ DoubleMatrix calc_norm_overlap_matrix(MPI_FuncVector &BraKet) {
 
     // Assumes linearity: result is sum of all nodes contributions
     mrcpp::mpi::allreduce_matrix(S, mrcpp::mpi::comm_wrk);
-
     return S;
 }
 
+/** @brief Orthogonalize the functions in Bra against all orbitals in Ket
+ *
+ */
+void orthogonalize(double prec, MPI_FuncVector &Bra, MPI_FuncVector &Ket) {
+    // TODO: generalize for cases where Ket functions are not orthogonal to each other?
+    ComplexMatrix S = mpifuncvec::calc_overlap_matrix(Bra, Ket);
+    int N = Bra.size();
+    int M = Ket.size();
+    DoubleVector Ketnorms = DoubleVector::Zero(M);
+    for (int i = 0; i < M; i++) {
+        if (mpi::my_orb(Ket[i])) Ketnorms(i)  = Ket[i].squaredNorm();
+    }
+    mrcpp::mpi::allreduce_vector(Ketnorms, mrcpp::mpi::comm_wrk);
+    ComplexMatrix rmat =  ComplexMatrix::Zero(M, N);
+    for (int j = 0; j < N; j++) {
+        for (int i = 0; i < M; i++) {
+            rmat(i,j) = 0.0 - S.conjugate()(j,i)/Ketnorms(i);
+        }
+    }
+    MPI_FuncVector rotatedKet(N);
+    mpifuncvec::rotate(Ket, rmat, rotatedKet, prec / M);
+    for (int j = 0; j < N; j++) {
+        if(my_orb(Bra[j]))Bra[j].add(1.0,rotatedKet[j]);
+    }
+}
 } // namespace mpifuncvec
 } // namespace mrcpp
diff --git a/src/utils/ComplexFunction.h b/src/utils/ComplexFunction.h
index 8fbd78bc5..cf33cadfb 100644
--- a/src/utils/ComplexFunction.h
+++ b/src/utils/ComplexFunction.h
@@ -186,6 +186,7 @@ class MPI_FuncVector : public std::vector<ComplexFunction> {
 
 namespace mpifuncvec {
 void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec = -1.0);
+void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, MPI_FuncVector &Psi, double prec = -1.0);
 void save_nodes(MPI_FuncVector &Phi, mrcpp::FunctionTree<3> &refTree, BankAccount &account, int sizes = -1);
 MPI_FuncVector multiply(MPI_FuncVector &Phi, RepresentableFunction<3> &f, double prec = -1.0, ComplexFunction *Func = nullptr, int nrefine = 1, bool all = false);
 ComplexVector dot(MPI_FuncVector &Bra, MPI_FuncVector &Ket);
@@ -193,5 +194,6 @@ ComplexMatrix calc_lowdin_matrix(MPI_FuncVector &Phi);
 ComplexMatrix calc_overlap_matrix(MPI_FuncVector &BraKet);
 ComplexMatrix calc_overlap_matrix(MPI_FuncVector &Bra, MPI_FuncVector &Ket);
 DoubleMatrix calc_norm_overlap_matrix(MPI_FuncVector &BraKet);
+void orthogonalize(double prec, MPI_FuncVector &Bra, MPI_FuncVector &Ket);
 } // namespace mpifuncvec
 } // namespace mrcpp
diff --git a/src/utils/mpi_utils.h b/src/utils/mpi_utils.h
index c2efb0c21..1b94b0dc9 100644
--- a/src/utils/mpi_utils.h
+++ b/src/utils/mpi_utils.h
@@ -51,6 +51,7 @@ extern int sh_group_rank;
 extern int is_bank;
 extern int is_bankclient;
 extern int bank_size;
+extern int omp_threads;
 extern int tot_bank_size;
 extern int max_tag;
 extern int task_bank;
diff --git a/src/utils/omp_utils.cpp b/src/utils/omp_utils.cpp
index 0ea4d5bdc..67ccd3069 100644
--- a/src/utils/omp_utils.cpp
+++ b/src/utils/omp_utils.cpp
@@ -24,19 +24,18 @@
  */
 
 #include "omp_utils.h"
-
+#include <iostream>
 namespace mrcpp {
 
 // By default we get OMP_NUM_THREADS
 int max_threads = mrcpp_get_max_threads();
 
 void set_max_threads(int threads) {
-    auto omp_max = mrcpp_get_max_threads();
-    if (threads < 1 or threads > omp_max) {
-        mrcpp::max_threads = omp_max;
-    } else {
-        mrcpp::max_threads = threads;
-    }
+    threads = std::max(1, threads);
+    mrcpp::max_threads = threads;
+#ifdef MRCPP_HAS_OMP
+    omp_set_num_threads(threads);
+#endif
 }
 
 } // namespace mrcpp
diff --git a/src/utils/parallel.cpp b/src/utils/parallel.cpp
index 5198ac307..2877d12e9 100644
--- a/src/utils/parallel.cpp
+++ b/src/utils/parallel.cpp
@@ -2,6 +2,7 @@
 #include <MRCPP/Printer>
 #include <MRCPP/Timer>
 #include <vector>
+#include <thread>
 
 #include "Bank.h"
 #include "ComplexFunction.h"
@@ -11,15 +12,18 @@
 
 #ifdef MRCPP_HAS_OMP
 #define mrcpp_get_max_threads() omp_get_max_threads()
+#define mrcpp_get_num_procs() omp_get_num_procs()/2
 #define mrcpp_set_dynamic(n) omp_set_dynamic(n)
 #else
 #define mrcpp_get_max_threads() 1
+#define mrcpp_get_num_procs() 1
 #define mrcpp_get_num_threads() 1
 #define mrcpp_get_thread_num() 0
 #define mrcpp_set_dynamic(n)
 #endif
 
 using mrcpp::Printer;
+using namespace std;
 
 namespace mrcpp {
 
@@ -51,9 +55,10 @@ int is_centralbank = 0;
 int is_bankclient = 1;
 int is_bankmaster = 0; // only one bankmaster is_bankmaster
 int bank_size = 0;
+int omp_threads = -1; // can be set to force number of threads
 int tot_bank_size = 0; // size of bank, including the task manager
 int max_tag = 0;       // max value allowed by MPI
-std::vector<int> bankmaster;
+vector<int> bankmaster;
 int task_bank = -1; // world rank of the task manager
 
 MPI_Comm comm_wrk;
@@ -71,7 +76,6 @@ extern int const size_metadata = 3;
 void mpi::initialize() {
     Eigen::setNbThreads(1);
     mrcpp_set_dynamic(0);
-    mrcpp::set_max_threads(omp::n_threads);
 
 #ifdef MRCPP_HAS_MPI
     MPI_Init(nullptr, nullptr);
@@ -90,7 +94,7 @@ void mpi::initialize() {
     if (mpi::world_size < 2) {
         mpi::bank_size = 0;
     } else if (mpi::bank_size < 0) {
-        mpi::bank_size = std::max(mpi::world_size / 3, 1);
+        mpi::bank_size = max(mpi::world_size / 3, 1);
     }
     if (mpi::world_size - mpi::bank_size < 1) MSG_ABORT("No MPI ranks left for working!");
     if (mpi::bank_size < 1 and mpi::world_size > 1) MSG_ABORT("Bank size must be at least one when using MPI!");
@@ -154,6 +158,62 @@ void mpi::initialize() {
     max_tag = *(int *)val / 2;
     id_shift = max_tag / 2; // half is reserved for non orbital.
 
+    // determine the number of threads we can assign to each mpi worker.
+    // mrcpp_get_num_procs is total number of hardware logical threads accessible by this mpi
+    // We assume that half of them are physical cores.
+    // mrcpp_get_max_threads is OMP_NUM_THREADS (environment variable).
+    // omp_threads_available is the total number of logical threads available on this compute-node
+    // We assume that half of them are physical cores.
+    //
+    // six conditions should be satisfied:
+    // 1) no one use more than mrcpp_get_num_procs()/2
+    // 2) NOT ENFORCED: no one use more than mrcpp_get_max_threads, as defined by rank 0
+    // 3) the total number of threads used on the compute-node must not exceed omp_threads_available/2
+    // 4) Bank needs only one thread
+    // 5) workers need as many threads as possible
+    // 6) at least one thread
+
+    MPI_Comm comm_share_world;//all that share the memory
+    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &comm_share_world);
+
+    int n_bank_thisnode; //number of banks on this node
+    MPI_Allreduce(&is_bank, &n_bank_thisnode, 1, MPI_INT, MPI_SUM, comm_share_world);
+    int n_wrk_thisnode; //number of workers on this node
+    MPI_Allreduce(&is_bankclient, &n_wrk_thisnode, 1, MPI_INT, MPI_SUM, comm_share_world);
+
+    int omp_threads_available = thread::hardware_concurrency();
+    int nthreads = 1;
+    if (is_bankclient) nthreads = (omp_threads_available/2-n_bank_thisnode)/n_wrk_thisnode; // 3) and 5)
+
+    // do not exceed total number of cores accessible (assumed to be half the number of logical threads)
+    nthreads = min(nthreads, mrcpp_get_num_procs()); // 1)
+
+    // NB: we do not use OMP_NUM_THREADS. Use all cores accessible. Could change this in the future
+    // if OMP_NUM_THREADS is set, do not exceed
+    // we enforce that all compute nodes use the same OMP_NUM_THREADS. Rank 0 decides.
+    /* int my_OMP_NUM_THREADS = mrcpp_get_max_threads();
+    MPI_Bcast(&my_OMP_NUM_THREADS, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+    if (my_OMP_NUM_THREADS > 0) nthreads = min(nthreads, my_OMP_NUM_THREADS); // 2)
+    */
+
+    nthreads = max(1, nthreads); // 6)
+
+    if (is_bank) nthreads = 1; // 4)
+
+    //cout<<world_rank<<" found "<<omp_threads_available<<" available threads. omp:"<<omp_get_num_procs()<<" "<<omp_get_max_threads()<<" "<<mrcpp::omp::n_threads<<" On this node: "<<n_bank_thisnode<<" banks "<<n_wrk_thisnode<<" workers"<<" "<<nthreads<<" is bank "<<is_bank<<endl;
+
+    if (omp_threads > 0) {
+        if (omp_threads != nthreads and world_rank == 0) {
+            cout<<"Warning: recommended number of threads is "<<nthreads<<endl;
+            cout<<"setting number of threads to omp_threads, "<<omp_threads<<endl;
+        }
+        nthreads = omp_threads;
+    }
+
+    omp::n_threads = nthreads;
+    mrcpp::set_max_threads(nthreads);
+
     if (mpi::is_bank) {
         // bank is open until end of program
         if (mpi::is_centralbank) { dataBank.open(); }
@@ -162,6 +222,7 @@ void mpi::initialize() {
     }
 #else
     mpi::bank_size = 0;
+    mrcpp::set_max_threads(omp::n_threads);
 #endif
 }
 
@@ -231,7 +292,7 @@ void mpi::allreduce_vector(DoubleVector &vec, MPI_Comm comm) {
 void mpi::allreduce_vector(ComplexVector &vec, MPI_Comm comm) {
 #ifdef MRCPP_HAS_MPI
     int N = vec.size();
-    MPI_Allreduce(MPI_IN_PLACE, vec.data(), N, MPI_CXX_DOUBLE_COMPLEX, MPI_SUM, comm);
+    MPI_Allreduce(MPI_IN_PLACE, vec.data(), N, MPI_C_DOUBLE_COMPLEX, MPI_SUM, comm);
 #endif
 }
 
@@ -255,7 +316,7 @@ void mpi::allreduce_matrix(DoubleMatrix &mat, MPI_Comm comm) {
 void mpi::allreduce_matrix(ComplexMatrix &mat, MPI_Comm comm) {
 #ifdef MRCPP_HAS_MPI
     int N = mat.size();
-    MPI_Allreduce(MPI_IN_PLACE, mat.data(), N, MPI_CXX_DOUBLE_COMPLEX, MPI_SUM, comm);
+    MPI_Allreduce(MPI_IN_PLACE, mat.data(), N, MPI_C_DOUBLE_COMPLEX, MPI_SUM, comm);
 #endif
 }
 
@@ -388,7 +449,7 @@ void mpi::reduce_Tree_noCoeff(mrcpp::FunctionTree<3> &tree, MPI_Comm comm) {
 /** @brief make union tree without coeff and send to all
  *  Include both real and imaginary parts
  */
-void mpi::allreduce_Tree_noCoeff(mrcpp::FunctionTree<3> &tree, std::vector<ComplexFunction> &Phi, MPI_Comm comm) {
+void mpi::allreduce_Tree_noCoeff(mrcpp::FunctionTree<3> &tree, vector<ComplexFunction> &Phi, MPI_Comm comm) {
     /* 1) make union grid of own orbitals
        2) make union grid with others orbitals (sent to rank zero)
        3) rank zero broadcast func to everybody
diff --git a/tests/trees/tree_io.cpp b/tests/trees/tree_io.cpp
index 12af6f42a..b193382da 100644
--- a/tests/trees/tree_io.cpp
+++ b/tests/trees/tree_io.cpp
@@ -37,7 +37,7 @@ namespace tree_io {
 template <int D> void testProjectFunction();
 
 SCENARIO("FunctionTree IO", "[tree_io], [trees]") {
-    const double prec = 1.0e-4;
+    const double prec = 1.0e-6;
 
     GaussFunc<3> *func = nullptr;
     initialize(&func);