diff --git a/api/MWOperators b/api/MWOperators index b61ee9075..3d6bc2793 100644 --- a/api/MWOperators +++ b/api/MWOperators @@ -38,3 +38,4 @@ #include "operators/BSOperator.h" #include "treebuilders/apply.h" +#include "treebuilders/DerivativeCalculator.h" diff --git a/docs/programmers_manual/MWTree.rst b/docs/programmers_manual/MWTree.rst index 67614b631..83abc383e 100644 --- a/docs/programmers_manual/MWTree.rst +++ b/docs/programmers_manual/MWTree.rst @@ -16,4 +16,3 @@ algorithm/equation/structure reasoning for having this class or where it fits wi :members: :protected-members: :private-members: - diff --git a/docs/programmers_manual/MultiResolutionAnalysis.rst b/docs/programmers_manual/MultiResolutionAnalysis.rst new file mode 100644 index 000000000..dfec65d66 --- /dev/null +++ b/docs/programmers_manual/MultiResolutionAnalysis.rst @@ -0,0 +1,16 @@ +------------------ +MultiResolutionAnalysis +------------------ + +The MultiResolutionAnalysis (MRA) class contains the methods to project objects onto the spatial grid. +That is, to combine different functions and operators in mathematical operations, they need to be compatible; +they must be defined on the same computational domain and constructed using the same polynomial basis +(order and type). This information constitutes an MRA, which needs to be defined and passed as argument to +all function and operator constructors, and only functions and operators with compatible MRAs can be +combined in subsequent calculations. + +.. doxygenclass:: mrcpp::MultiResolutionAnalysis + :members: + :protected-members: + :private-members: + diff --git a/docs/programmers_manual/overview.rst b/docs/programmers_manual/overview.rst index 7ff6ff4cf..e78ab5cb7 100644 --- a/docs/programmers_manual/overview.rst +++ b/docs/programmers_manual/overview.rst @@ -13,6 +13,7 @@ TODO: maybe add some low level theory/figures/algorithms before showing classes, OperatorNode OperatorTree BoundingBox + MultiResolutionAnalysis CrossCorrelationCache LegendreBasis InterpolatingBasis @@ -20,12 +21,12 @@ TODO: maybe add some low level theory/figures/algorithms before showing classes, ConvolutionCalculator CornerOperatorTree + TimeEvolutionOperator JpowerIntegrals special_functions SchrodingerEvolution_CrossCorrelation TimeEvolution_CrossCorrelationCalculator complex_apply - HeatOperator HeatKernel diff --git a/src/functions/RepresentableFunction.h b/src/functions/RepresentableFunction.h index 524484868..2d6998812 100644 --- a/src/functions/RepresentableFunction.h +++ b/src/functions/RepresentableFunction.h @@ -31,11 +31,13 @@ #pragma once +#include #include #include #include "MRCPP/constants.h" #include "MRCPP/mrcpp_declarations.h" +#include "trees/NodeIndex.h" namespace mrcpp { @@ -74,4 +76,15 @@ template class RepresentableFunction { virtual bool isZeroOnInterval(const double *a, const double *b) const { return false; } }; +/* + * Same as RepresentableFunction, but output a matrix of values + * for all points in a node, given its NodeIndex. + * + */ +class RepresentableFunction_M { +public: + RepresentableFunction_M() {} + virtual Eigen::MatrixXd evalf(mrcpp::NodeIndex<3> nIdx) const = 0; +}; + } // namespace mrcpp diff --git a/src/operators/IdentityConvolution.cpp b/src/operators/IdentityConvolution.cpp index 8e9bea624..5b8bde3af 100644 --- a/src/operators/IdentityConvolution.cpp +++ b/src/operators/IdentityConvolution.cpp @@ -29,7 +29,7 @@ namespace mrcpp { -/** @brief Constructor of the IdentityConvolution object +/** @brief Constructor of the IdentityConvolution object * @returns New IdentityConvolution object * @param[in] mra: Which MRA the operator is defined * @param[in] prec: Build precision, closeness to delta function diff --git a/src/treebuilders/DerivativeCalculator.cpp b/src/treebuilders/DerivativeCalculator.cpp index 97b82414f..a5acdc297 100644 --- a/src/treebuilders/DerivativeCalculator.cpp +++ b/src/treebuilders/DerivativeCalculator.cpp @@ -86,6 +86,34 @@ template void DerivativeCalculator::printTimers() const { Printer::setPrecision(oldprec); } +template void DerivativeCalculator::calcNode(MWNode &inpNode, MWNode &outNode) { + //if (this->oper->getMaxBandWidth() > 1) MSG_ABORT("Only implemented for zero bw"); + outNode.zeroCoefs(); + int nComp = (1 << D); + double tmpCoefs[outNode.getNCoefs()]; + OperatorState os(outNode, tmpCoefs); + + os.setFNode(inpNode); + os.setFIndex(inpNode.nodeIndex); + for (int ft = 0; ft < nComp; ft++) { + double fNorm = inpNode.getComponentNorm(ft); + if (fNorm < MachineZero) { continue; } // Could this be used outside the loop? + os.setFComponent(ft); + for (int gt = 0; gt < nComp; gt++) { + os.setGComponent(gt); + applyOperator_bw0(os); + } + } + // Multiply appropriate scaling factor. TODO: Could be included elsewhere + const double scaling_factor = + 1.0/std::pow(outNode.getMWTree().getMRA().getWorldBox().getScalingFactor(this->applyDir), oper->getOrder()); + if(abs(scaling_factor-1.0)>MachineZero){ + for (int i = 0; i < outNode.getNCoefs(); i++) outNode.getCoefs()[i] *= scaling_factor; + } + outNode.calcNorms(); //TODO:required? norms are not used for now +} + + template void DerivativeCalculator::calcNode(MWNode &gNode) { gNode.zeroCoefs(); @@ -101,6 +129,7 @@ template void DerivativeCalculator::calcNode(MWNode &gNode) { this->band_t[mrcpp_get_thread_num()].stop(); this->calc_t[mrcpp_get_thread_num()].resume(); + for (int n = 0; n < fBand.size(); n++) { MWNode &fNode = *fBand[n]; NodeIndex &fIdx = idx_band[n]; @@ -152,6 +181,39 @@ MWNodeVector DerivativeCalculator::makeOperBand(const MWNode &gNode, st return band; } +/** Apply a single operator component (term) to a single f-node assuming zero bandwidth */ +template void DerivativeCalculator::applyOperator_bw0(OperatorState &os) { + //cout<<" applyOperator "< &gNode = *os.gNode; + MWNode &fNode = *os.fNode; + const NodeIndex &fIdx = *os.fIdx; + const NodeIndex &gIdx = gNode.getNodeIndex(); + int depth = gNode.getDepth(); + + double oNorm = 1.0; + double **oData = os.getOperData(); + + for (int d = 0; d < D; d++) { + const OperatorTree &oTree = this->oper->getComponent(0, d); + const OperatorNode &oNode = oTree.getNode(depth, 0); + int oIdx = os.getOperIndex(d); + if (this->applyDir == d) { + oData[d] = const_cast(oNode.getCoefs()) + oIdx * os.kp1_2; + } else { + if (oIdx == 0 or oIdx == 3) { + // This will activate the identity operator in direction i + oData[d] = nullptr; + } else { + // This means that we are in a zero part of the identity operator + return; + } + } + } + this->operStat.incrementFNodeCounters(fNode, os.ft, os.gt); + tensorApplyOperComp(os); +} + + /** Apply a single operator component (term) to a single f-node. Whether the operator actualy is applied is determined by a screening threshold. */ template void DerivativeCalculator::applyOperator(OperatorState &os) { @@ -197,8 +259,8 @@ template void DerivativeCalculator::applyOperator(OperatorState &o tensorApplyOperComp(os); } -/** Perorm the required linear algebra operations in order to apply an -operator component to a f-node in a n-dimensional tesor space. */ +/** Perform the required linear algebra operations in order to apply an +operator component to a f-node in a n-dimensional tensor space. */ template void DerivativeCalculator::tensorApplyOperComp(OperatorState &os) { double **aux = os.getAuxData(); double **oData = os.getOperData(); diff --git a/src/treebuilders/DerivativeCalculator.h b/src/treebuilders/DerivativeCalculator.h index 67dbac01c..5d4d28716 100644 --- a/src/treebuilders/DerivativeCalculator.h +++ b/src/treebuilders/DerivativeCalculator.h @@ -36,6 +36,7 @@ template class DerivativeCalculator final : public TreeCalculator { ~DerivativeCalculator() override; MWNodeVector *getInitialWorkVector(MWTree &tree) const override; + void calcNode(MWNode &fNode, MWNode &gNode); private: int applyDir; @@ -61,6 +62,7 @@ template class DerivativeCalculator final : public TreeCalculator { } void applyOperator(OperatorState &os); + void applyOperator_bw0(OperatorState &os); void tensorApplyOperComp(OperatorState &os); }; diff --git a/src/treebuilders/apply.cpp b/src/treebuilders/apply.cpp index ce5c08e53..3dc49de3c 100644 --- a/src/treebuilders/apply.cpp +++ b/src/treebuilders/apply.cpp @@ -256,6 +256,7 @@ template void apply_far_field(double prec, FunctionTree &out, Convolu template void apply_near_field(double prec, FunctionTree &out, ConvolutionOperator &oper, FunctionTree &inp, int maxIter, bool absPrec) { apply_on_unit_cell(true, prec, out, oper, inp, maxIter, absPrec); } + /** @brief Application of MW derivative operator * * @param[out] out: Output function to be built @@ -274,7 +275,6 @@ template void apply_near_field(double prec, FunctionTree &out, Convol */ template void apply(FunctionTree &out, DerivativeOperator &oper, FunctionTree &inp, int dir) { if (out.getMRA() != inp.getMRA()) MSG_ABORT("Incompatible MRA"); - TreeBuilder builder; int maxScale = out.getMRA().getMaxScale(); diff --git a/src/trees/FunctionNode.cpp b/src/trees/FunctionNode.cpp index 2c8eee329..c839e2b57 100644 --- a/src/trees/FunctionNode.cpp +++ b/src/trees/FunctionNode.cpp @@ -130,7 +130,6 @@ template double FunctionNode::integrateInterpolating() const { int qOrder = this->getKp1(); getQuadratureCache(qc); const VectorXd &weights = qc.getWeights(qOrder); - double sqWeights[qOrder]; for (int i = 0; i < qOrder; i++) sqWeights[i] = std::sqrt(weights[i]); @@ -158,6 +157,52 @@ template double FunctionNode::integrateInterpolating() const { return two_n * sum; } +/** Function integration, Interpolating basis. + * + * Integrates the function represented on the node on the full support of the + * node. A bit more involved than in the Legendre basis, as is requires some + * coupling of quadrature weights. */ +template double FunctionNode::integrateValues() const { + int qOrder = this->getKp1(); + getQuadratureCache(qc); + const VectorXd &weights = qc.getWeights(qOrder); + VectorXd coefs; + this->getCoefs(coefs); + int ncoefs = coefs.size(); + int ncoefChild = ncoefs/(1< 3) MSG_ABORT("Not Implemented") + else if (D == 3) { + for (int i = 0; i < qOrder; i++) { + double sumj = 0.0; + for (int j = 0; j < qOrder; j++) { + double sumk = 0.0; + for (int k = 0; k < qOrder; k++) sumk += cc[nc++] * weights[k]; + sumj += sumk * weights[j]; + } + sum += sumj * weights[i]; + } + } else if (D==2) { + for (int j = 0; j < qOrder; j++) { + double sumk = 0.0; + for (int k = 0; k < qOrder; k++) sumk += cc[nc++] * weights[k]; + sum += sumk * weights[j]; + } + } else if (D==1) for (int k = 0; k < qOrder; k++) sum += cc[nc++] * weights[k]; + + int n = D * (this->getScale() + 1) ; // NB: one extra scale + int two_n = (1<0)sum/=two_n; + else sum*=two_n; + return sum; +} + template void FunctionNode::setValues(const VectorXd &vec) { this->zeroCoefs(); this->setCoefBlock(0, vec.size(), vec.data()); @@ -319,6 +364,9 @@ template void FunctionNode::dealloc() { auto &ftree = this->getFuncTree(); if (this->isGenNode()) { ftree.getGenNodeAllocator().dealloc(sIdx); + // for GenNodes we clean unused chunks carefully, as they can become + // very large and occupy space long after used. + ftree.getGenNodeAllocator().deleteUnusedChunks(); } else { ftree.decrementNodeCount(this->getScale()); ftree.getNodeAllocator().dealloc(sIdx); diff --git a/src/trees/FunctionNode.h b/src/trees/FunctionNode.h index c110e12d8..97a3d74d3 100644 --- a/src/trees/FunctionNode.h +++ b/src/trees/FunctionNode.h @@ -77,6 +77,7 @@ template class FunctionNode final : public MWNode { double integrateLegendre() const; double integrateInterpolating() const; + double integrateValues() const; }; template double dot_scaling(const FunctionNode &bra, const FunctionNode &ket); diff --git a/src/trees/FunctionTree.cpp b/src/trees/FunctionTree.cpp index 6d27270ac..47614a933 100644 --- a/src/trees/FunctionTree.cpp +++ b/src/trees/FunctionTree.cpp @@ -54,7 +54,7 @@ template FunctionTree::FunctionTree(const MultiResolutionAnalysis &mra, SharedMemory *sh_mem, const std::string &name) : MWTree(mra, name) , RepresentableFunction(mra.getWorldBox().getLowerBounds().data(), mra.getWorldBox().getUpperBounds().data()) { - int nodesPerChunk = 64; + int nodesPerChunk = 2048; // Large chunks are required for not leading to memory fragmentation (32 MB on "Betzy" 2023) int coefsGenNodes = this->getKp1_d(); int coefsRegNodes = this->getTDim() * this->getKp1_d(); this->nodeAllocator_p = std::make_unique>(this, sh_mem, coefsRegNodes, nodesPerChunk); @@ -186,6 +186,42 @@ template double FunctionTree::integrate() const { return jacobian * result; } + +/** @returns Integral of a representable function over the grid given by the tree */ +template <> double FunctionTree<3>::integrateEndNodes(RepresentableFunction_M &f) { + //traverse tree, and treat end nodes only + std::vector *> stack; // node from this + for (int i = 0; i < this->getRootBox().size(); i++) stack.push_back(&(this->getRootFuncNode(i))); + int basis = getMRA().getScalingBasis().getScalingType(); + double result = 0.0; + while (stack.size() > 0) { + FunctionNode<3> *Node = stack.back(); + stack.pop_back(); + if (Node->getNChildren() > 0) { + for (int i = 0; i < Node->getNChildren(); i++) stack.push_back(&(Node->getFuncChild(i))); + } else { + //end nodes + Eigen::MatrixXd fmat = f.evalf(Node->nodeIndex); + double *coefs = Node->getCoefs(); // save position of coeff, but do not use them! + // The data in fmat is not organized so that two consecutive points are stored after each other in memory, so needs to copy before mwtransform, cannot use memory adress directly. + int nc=fmat.cols(); + double cc[nc]; + for (int i = 0; i < nc; i++)cc[i]=fmat(0,i); + Node->attachCoefs(cc); + result += Node->integrateValues(); + Node->attachCoefs(coefs); // put back original coeff + } + } + + // Handle potential scaling + auto scaling_factor = this->getMRA().getWorldBox().getScalingFactors(); + auto jacobian = 1.0; + for (const auto &sf_i : scaling_factor) jacobian *= std::sqrt(sf_i); + // Square root of scaling factor in each diection. The seemingly missing + // multiplication by the square root of sf_i is included in the basis + return jacobian * result; +} + /** @returns Function value in a point, out of bounds returns zero * * @param[in] r: Cartesian coordinate diff --git a/src/trees/FunctionTree.h b/src/trees/FunctionTree.h index 613399109..0be9563ea 100644 --- a/src/trees/FunctionTree.h +++ b/src/trees/FunctionTree.h @@ -62,6 +62,7 @@ template class FunctionTree final : public MWTree, public Representab ~FunctionTree() override; double integrate() const; + double integrateEndNodes(RepresentableFunction_M &f); double evalf_precise(const Coord &r); double evalf(const Coord &r) const override; diff --git a/src/trees/MWNode.cpp b/src/trees/MWNode.cpp index 927019f63..d15c0939f 100644 --- a/src/trees/MWNode.cpp +++ b/src/trees/MWNode.cpp @@ -103,7 +103,7 @@ MWNode::MWNode(MWTree *tree, int rIdx) /** @brief MWNode constructor. * * @param[in] parent: parent node - * @param[in] cIdx: child index of the current node + * @param[in] cIdx: child index of the current node * * @details Constructor for leaf nodes. It requires the corresponding * parent and an integer to identify the correct child. @@ -131,7 +131,7 @@ MWNode::MWNode(MWNode *parent, int cIdx) * the tree. */ template -MWNode::MWNode(const MWNode &node, bool allocCoef) +MWNode::MWNode(const MWNode &node, bool allocCoef, bool SetCoef) : tree(node.tree) , parent(nullptr) , nodeIndex(node.nodeIndex) @@ -141,7 +141,7 @@ MWNode::MWNode(const MWNode &node, bool allocCoef) setIsLooseNode(); if (allocCoef) { allocCoefs(this->getTDim(), this->getKp1_d()); - if (node.hasCoefs()) { + if (node.hasCoefs() and SetCoef) { setCoefBlock(0, node.getNCoefs(), node.getCoefs()); if (this->getNCoefs() > node.getNCoefs()) { for (int i = node.getNCoefs(); i < this->getNCoefs(); i++) this->coefs[i] = 0.0; @@ -236,7 +236,7 @@ template void MWNode::getCoefs(Eigen::VectorXd &c) const { c = VectorXd::Map(this->coefs, this->n_coefs); } -/** @brief sets all MW coefficients and the norms to zero +/** @brief sets all MW coefficients and the norms to zero * */ template void MWNode::zeroCoefs() { @@ -388,7 +388,7 @@ template void MWNode::giveParentCoefs(bool overwrite) { } /** @brief Copy scaling coefficients from children to parent - * + * * @details Takes the scaling coefficients of the children and stores * them consecutively in the corresponding block of the parent, * following the usual bitwise notation. @@ -404,10 +404,12 @@ template void MWNode::copyCoefsFromChildren() { } /** @brief Generates scaling cofficients of children - * + * * @details If the node is a leafNode, it takes the scaling&wavelet * coefficients of the parent and it generates the scaling - * coefficients for the children + * coefficients for the children and stores + * them consecutively in the corresponding block of the parent, + * following the usual bitwise notation. */ template void MWNode::threadSafeGenChildren() { if (tree->isLocal) { NOT_IMPLEMENTED_ABORT; } @@ -835,7 +837,7 @@ template void MWNode::getPrimitiveQuadPts(MatrixXd &pts) const { * the set of quadrature points becomes \f$ x^\alpha_i = 2^{-n-1} (x_i + 2 l^\alpha + t^\alpha) \f$, where \f$ t^\alpha = * 0,1 \f$. By taking all possible \f$(k+1)^d\combinations \f$, they will * then define a d-dimensional grid of quadrature points for the child - * nodes. + * nodes. * */ template void MWNode::getPrimitiveChildPts(MatrixXd &pts) const { diff --git a/src/trees/MWNode.h b/src/trees/MWNode.h index 2b398c78c..d7d7a18a7 100644 --- a/src/trees/MWNode.h +++ b/src/trees/MWNode.h @@ -47,13 +47,13 @@ namespace mrcpp { * set thoucgh the template parameter D=1,2,3. In addition to the * coefficients the node contains metadata such as the scale, the * translation index, the norm, pointers to parent node and child - * nodes, pointer to the corresponding MWTree etc... See memeber and + * nodes, pointer to the corresponding MWTree etc... See member and * data descriptions for details. * */ template class MWNode { public: - MWNode(const MWNode &node, bool allocCoef = true); + MWNode(const MWNode &node, bool allocCoef = true, bool SetCoef = true); MWNode &operator=(const MWNode &node) = delete; virtual ~MWNode(); @@ -164,6 +164,7 @@ template class MWNode { friend class OperatorTree; friend class FunctionNode; friend class OperatorNode; + friend class DerivativeCalculator; protected: MWTree *tree{nullptr}; ///< Tree the node belongs to @@ -171,7 +172,7 @@ template class MWNode { MWNode *children[1 << D]; ///< 2^D children double squareNorm{-1.0}; ///< Squared norm of all 2^D (k+1)^D coefficients - double componentNorms[1 << D]; ///< Squared norms of the separeted 2^D components + double componentNorms[1 << D]; ///< Squared norms of the separeted 2^D components double maxSquareNorm{-1.0}; ///< Largest squared norm among itself and descendants. double maxWSquareNorm{-1.0}; ///< Largest wavelet squared norm among itself and descendants. ///< NB: must be set before used. diff --git a/src/trees/MWTree.h b/src/trees/MWTree.h index 1587aa644..51cfe3eed 100644 --- a/src/trees/MWTree.h +++ b/src/trees/MWTree.h @@ -39,7 +39,7 @@ namespace mrcpp { class BankAccount; - + /** @class MWTree * * @brief Base class for Multiwavelet tree structures, such as FunctionTree and OperatorTree diff --git a/src/trees/MultiResolutionAnalysis.cpp b/src/trees/MultiResolutionAnalysis.cpp index 4ab7493bb..6eaabb120 100644 --- a/src/trees/MultiResolutionAnalysis.cpp +++ b/src/trees/MultiResolutionAnalysis.cpp @@ -32,6 +32,22 @@ namespace mrcpp { +/** @returns New MultiResolutionAnalysis (MRA) object + * + * @brief Constructs a MultiResolutionAnalysis object composed of computational domain (world) and a polynomial basis (Multiwavelets) + * + * @param[in] bb: 2-element integer array [Lower, Upper] defining the bounds for a BoundingBox object representing the computational domain + * @param[in] order: Maximum polynomial order of the multiwavelet basis, + * immediately used in the constructor of an InterPolatingBasis object which becomes an attribute of the MRA + * @param[in] maxDepth: Exponent of the node refinement in base 2, relative to root scale. + * In other words, it is the maximum amount of refinement that we allow in a node, in other to avoid overflow of values. + * + * @details Constructor of the MultiResolutionAnalysis class from scratch, without requiring any pre-existing complex structure. + * The constructor calls the InterpolatingBasis basis constructor to generate the MultiWavelets basis of functions, + * then the BoundingBox constructor to create the computational domain. The constructor then checks if the generated node depth, or + * node refinement is beyond the root scale or the maximum depth allowed, in which case it will abort the process. + * Otherwise, the process goes on to setup the filters with the class' setupFilter method. + */ template MultiResolutionAnalysis::MultiResolutionAnalysis(std::array bb, int order, int depth) : maxDepth(depth) @@ -42,6 +58,18 @@ MultiResolutionAnalysis::MultiResolutionAnalysis(std::array bb, int o setupFilter(); } +/** @returns New MultiResolutionAnalysis (MRA) object + * + * @brief Constructs a MultiResolutionAnalysis object composed of computational domain (world) and a polynomial basis (Multiwavelets) from a pre-existing BoundingBox object + * + * @param[in] bb: BoundingBox object representing the computational domain + * @param[in] order: (integer) Maximum polynomial order of the multiwavelet basis, + * immediately used in the constructor of an InterPolatingBasis object which becomes an attribute of the MRA + * @param[in] maxDepth: (integer) Exponent of the node refinement in base 2, relative to root scale. + * In other words, it is the maximum amount of refinement that we allow in a node, in other to avoid overflow of values. + * + * @details Constructor of the MultiResolutionAnalysis class from a BoundingBox object. For more details see the first constructor. + */ template MultiResolutionAnalysis::MultiResolutionAnalysis(const BoundingBox &bb, int order, int depth) : maxDepth(depth) @@ -52,6 +80,14 @@ MultiResolutionAnalysis::MultiResolutionAnalysis(const BoundingBox &bb, in setupFilter(); } +/** @returns New MultiResolutionAnalysis (MRA) object + * + * @brief Copy constructor for a MultiResolutionAnalysis object composed of computational domain (world) and a polynomial basis (Multiwavelets) + * + * @param[in] mra: Pre-existing MRA object + * + * @details Copy a MultiResolutionAnalysis object without modifying the original. For more details see the first constructor. + */ template MultiResolutionAnalysis::MultiResolutionAnalysis(const MultiResolutionAnalysis &mra) : maxDepth(mra.maxDepth) @@ -63,9 +99,14 @@ MultiResolutionAnalysis::MultiResolutionAnalysis(const MultiResolutionAnalysi } /** @returns New MultiResolutionAnalysis object - * @param[in] bb: Computational domain - * @param[in] sb: Polynomial basis + * + * @brief Constructor for a MultiResolutionAnalysis object from a pre-existing BoundingBox (computational domain) and a ScalingBasis (Multiwavelet basis) objects + * + * @param[in] bb: Computational domain as a BoundingBox object, taken by constant reference + * @param[in] sb: Polynomial basis (MW) as a ScalingBasis object * @param[in] depth: Maximum allowed resolution depth, relative to root scale + * + * @details Creates a MRA object from pre-existing BoundingBox and ScalingBasis objects. These objects are taken as reference. For more details about the constructor itself, see the first constructor. */ template MultiResolutionAnalysis::MultiResolutionAnalysis(const BoundingBox &bb, const ScalingBasis &sb, int depth) @@ -77,6 +118,16 @@ MultiResolutionAnalysis::MultiResolutionAnalysis(const BoundingBox &bb, co setupFilter(); } +/** @returns Whether the two MRA objects are equal. + * + * @brief Equality operator for the MultiResolutionAnalysis class, returns true if both MRAs have the same polynomial basis, computational domain and maximum depth, and false otherwise + * + * @param[in] mra: MRA object, taken by constant reference + * + * @details Equality operator for the MultiResolutionAnalysis class, returns true if both MRAs have the same polynomial basis represented by a BoundingBox object, computational domain (ScalingBasis object) and maximum depth (integer), and false otherwise. + * Computations on different MRA cannot be combined, this operator can be used to make sure that the multiple MRAs are compatible. + * For more information about the meaning of equality for BoundingBox and ScalingBasis objets, see their respective classes. + */ template bool MultiResolutionAnalysis::operator==(const MultiResolutionAnalysis &mra) const { if (this->basis != mra.basis) return false; if (this->world != mra.world) return false; @@ -84,13 +135,28 @@ template bool MultiResolutionAnalysis::operator==(const MultiResoluti return true; } +/** @returns Whether the two MRA objects are not equal. + * + * @brief Inequality operator for the MultiResolutionAnalysis class, returns false if both MRAs have the same polynomial basis, computational domain and maximum depth, and true otherwise + * + * @param[in] mra: MRA object, taken by constant reference + * + * @details Inequality operator for the MultiResolutionAnalysis class, returns true if both MRAs have the same polynomial basis represented by a BoundingBox object, computational domain (ScalingBasis object) and maximum depth (integer), and false otherwise. + * Opposite of the == operator. + * For more information about the meaning of equality for BoundingBox and ScalingBasis objets, see their respective classes. + */ template bool MultiResolutionAnalysis::operator!=(const MultiResolutionAnalysis &mra) const { - if (this->basis != mra.basis) return true; - if (this->world != mra.world) return true; - if (this->maxDepth != mra.maxDepth) return true; - return false; + return !(*this == mra); } +/** + * + * @brief Displays the MRA's attributes in the outstream defined in the Printer class + * + * @details This function displays the attributes of the MRA in the using the Printer class. + * By default, the Printer class writes all information in the output file, not the terminal. + * + */ template void MultiResolutionAnalysis::print() const { print::separator(0, ' '); print::header(0, "MultiResolution Analysis"); @@ -100,6 +166,15 @@ template void MultiResolutionAnalysis::print() const { print::separator(0, '=', 2); } +/** + * + * @brief Initializes the MW filters for the given MW basis. + * + * @details By calling the get() function for the appropriate MW basis, the global + * FilterCache Singleton object is initialized. Any subsequent reference to this + * particular filter will point to the same unique global object. + * + */ template void MultiResolutionAnalysis::setupFilter() { getLegendreFilterCache(lfilters); getInterpolatingFilterCache(ifilters); @@ -117,6 +192,11 @@ template void MultiResolutionAnalysis::setupFilter() { } } +/** @returns Maximum possible distance between two points in the MRA domain + * + * @brief Computes the difference between the lower and upper bounds of the computational domain + * + */ template double MultiResolutionAnalysis::calcMaxDistance() const { const Coord &lb = getWorldBox().getLowerBounds(); const Coord &ub = getWorldBox().getUpperBounds(); diff --git a/src/trees/NodeAllocator.cpp b/src/trees/NodeAllocator.cpp index 65d76b940..9ca79f0b4 100644 --- a/src/trees/NodeAllocator.cpp +++ b/src/trees/NodeAllocator.cpp @@ -253,6 +253,7 @@ template int NodeAllocator::deleteUnusedChunks() { // number of occupied chunks int nChunksTotal = getNChunks(); int nChunksUsed = getNChunksUsed(); + if(nChunksTotal == nChunksUsed) return 0; // no unused chunks assert(nChunksTotal >= nChunksUsed); for (int i = nChunksUsed; i < nChunksTotal; i++) delete[](char *)(this->nodeChunks[i]); diff --git a/src/trees/NodeAllocator.h b/src/trees/NodeAllocator.h index d16f7a22f..38e4ba7eb 100644 --- a/src/trees/NodeAllocator.h +++ b/src/trees/NodeAllocator.h @@ -56,6 +56,7 @@ template class NodeAllocator final { int compress(); void reassemble(); + int deleteUnusedChunks(); int getNNodes() const { return this->nNodes; } int getNCoefs() const { return this->coefsPerNode; } @@ -97,7 +98,6 @@ template class NodeAllocator final { void moveNodes(int nNodes, int srcIdx, int dstIdx); void appendChunk(bool coefs); - int deleteUnusedChunks(); int findNextAvailable(int sIdx, int nNodes) const; int findNextOccupied(int sIdx) const; diff --git a/src/utils/Bank.cpp b/src/utils/Bank.cpp index a70385136..a774c44ff 100644 --- a/src/utils/Bank.cpp +++ b/src/utils/Bank.cpp @@ -16,18 +16,49 @@ Bank::~Bank() { } struct Blockdata_struct { - std::vector data; // to store the incoming data - std::vector deleted; // to indicate if it has been deleted already - MatrixXd BlockData; // to put the final block - // eigen matrix are per default stored column-major (one can add columns at the end) - std::vector N_rows; + std::vector data; // to store the incoming data. One column for each orbital on the same node. + int N_rows = 0; // the number of coefficients in one column of the block. std::map id2data; // internal index of the data in the block std::vector id; // the id of each column. Either nodeid, or orbid }; -std::map *> get_nodeid2block; // to get block from its nodeid (all coeff for one node) -std::map *> get_orbid2block; // to get block from its orbid +struct OrbBlock_struct { + std::vector data; // pointer to the data + std::map id2data; // internal index of the data in the block + std::vector id; // the nodeid of the data + // note that N_rows can be different inside the same orbblock: root node have scaling and wavelets, other nodes have only wavelets +}; +struct mem_struct { + std::vector chunk_p; // vector with allocated chunks + int p = -1; // position of next available memory (not allocated if < 0) + //on Betzy 1024*1024*4 ok, 1024*1024*2 NOT ok: leads to memory fragmentation (on "Betzy" 2023) + int chunk_size = 1024*1024*4; // chunksize (in number of doubles). data_p[i]+chunk_size is end of chunk i + int account=-1; + double * get_mem(int size){ + if(p<0 or size > chunk_size or p + size > chunk_size){ //allocate new chunk of memory + if(size > 1024*1024){ + //make a special chunk just for this + double * m_p = new double[size]; + chunk_p.push_back(m_p); + p=-1; + return m_p; + } else { + double * m_p = new double[chunk_size]; + chunk_p.push_back(m_p); + p=0; + } + } + double * m_p = chunk_p[chunk_p.size()-1] + p; + p += size; + return m_p; + } +}; +std::map *> get_nodeid2block; // to get block from its nodeid (all coeff for one node) +std::map *> get_orbid2block; // to get block from its orbid + +std::map mem; int const MIN_SCALE = -999; // Smaller than smallest scale +int naccounts = 0; void Bank::open() { #ifdef MRCPP_HAS_MPI @@ -67,9 +98,10 @@ void Bank::open() { continue; } else if (message == NEW_ACCOUNT) { // we just have to pick out a number that is not already assigned - int account = (max_account_id + 1) % 1000000000 + 1; + int account = (max_account_id + 1) % 1000000000; while (get_deposits.count(account)) account = (account + 1) % 1000000000; // improbable this is used max_account_id = account; + naccounts++; // create default content get_deposits[account] = new std::vector; get_deposits[account]->resize(1); @@ -77,11 +109,13 @@ void Bank::open() { get_id2qu[account] = new std::map; get_queue[account] = new std::vector; get_queue[account]->resize(1); - get_orbid2block[account] = new std::map; - get_nodeid2block[account] = new std::map; + get_orbid2block[account] = new std::map; + get_nodeid2block[account] = new std::map; get_numberofclients[account] = messages[1]; get_readytasks[account] = new std::map>; currentsize[account] = 0; + mem[account] = new mem_struct; + mem[account]->account=account; MPI_Send(&account, 1, MPI_INT, status.MPI_SOURCE, 1, comm_bank); continue; } @@ -98,8 +132,8 @@ void Bank::open() { std::map &id2ix = *get_id2ix[account]; // gives zero if id is not defined std::map &id2qu = *get_id2qu[account]; std::vector &queue = *get_queue[account]; - std::map &orbid2block = *get_orbid2block[account]; - std::map &nodeid2block = *get_nodeid2block[account]; + std::map &orbid2block = *get_orbid2block[account]; + std::map &nodeid2block = *get_nodeid2block[account]; auto it_tasks = get_readytasks.find(account); if (it_tasks == get_readytasks.end() || it_tasks->second == nullptr) { cout << "ERROR, my account does not exist!! " << account << " " << message << endl; @@ -111,7 +145,6 @@ void Bank::open() { get_numberofclients[account]--; if (get_numberofclients[account] == 0) { // all clients have closed the account. We remove the account. - totcurrentsize -= currentsize[account]; remove_account(account); } } @@ -119,116 +152,51 @@ void Bank::open() { else if (message == CLEAR_BANK) { this->clear_bank(); for (auto const &block : nodeid2block) { - if (block.second == nullptr) continue; - for (int i = 0; i < block.second->data.size(); i++) { - if (not block.second->deleted[i]) { - currentsize[account] -= block.second->N_rows[i] / 128; // converted into kB - totcurrentsize -= block.second->N_rows[i] / 128; // converted into kB - delete[] block.second->data[i]; - } + if (block.second.data.size() > 0) { + currentsize[account] -= block.second.N_rows * block.second.data.size()/ 128; // converted into kB + totcurrentsize -= block.second.N_rows * block.second.data.size()/ 128; // converted into kB } - delete block.second; } nodeid2block.clear(); orbid2block.clear(); // send message that it is ready (value of message is not used) MPI_Ssend(&message, 1, MPI_INT, status.MPI_SOURCE, 77, comm_bank); - } else if (message == CLEAR_BLOCKS) { - // clear only blocks whith id less than idmax. - int idmax = messages[2]; - std::vector toeraseVec; // it is dangerous to erase an iterator within its own loop - for (auto const &block : nodeid2block) { - if (block.second == nullptr) toeraseVec.push_back(block.first); - if (block.second == nullptr) continue; - if (block.first >= idmax and idmax != 0) continue; - for (int i = 0; i < block.second->data.size(); i++) { - if (not block.second->deleted[i]) { - currentsize[account] -= block.second->N_rows[i] / 128; // converted into kB - totcurrentsize -= block.second->N_rows[i] / 128; // converted into kB - delete[] block.second->data[i]; - } - } - currentsize[account] -= block.second->BlockData.size() / 128; // converted into kB - totcurrentsize -= block.second->BlockData.size() / 128; // converted into kB - block.second->BlockData.resize(0, 0); // NB: the matrix does not clear itself otherwise - // assert(currentsize[account] >= 0); - this->currentsize[account] = std::max(0ll, currentsize[account]); - toeraseVec.push_back(block.first); - } - for (int ierase : toeraseVec) { nodeid2block.erase(ierase); } - toeraseVec.clear(); - std::vector datatoeraseVec; - for (auto const &block : orbid2block) { - if (block.second == nullptr) toeraseVec.push_back(block.first); - if (block.second == nullptr) continue; - datatoeraseVec.clear(); - for (int i = 0; i < block.second->data.size(); i++) { - if (block.second->id[i] < idmax or idmax == 0) datatoeraseVec.push_back(i); - if (block.second->id[i] < idmax or idmax == 0) block.second->data[i] = nullptr; - } - std::sort(datatoeraseVec.begin(), datatoeraseVec.end()); - std::reverse(datatoeraseVec.begin(), datatoeraseVec.end()); - for (int ierase : datatoeraseVec) { - block.second->id.erase(block.second->id.begin() + ierase); - block.second->data.erase(block.second->data.begin() + ierase); - block.second->N_rows.erase(block.second->N_rows.begin() + ierase); - } - if (block.second->data.size() == 0) toeraseVec.push_back(block.first); - } - for (int ierase : toeraseVec) { orbid2block.erase(ierase); } - - if (idmax == 0) orbid2block.clear(); - // could have own clear for data? - for (int ix = 1; ix < deposits.size(); ix++) { - if (deposits[ix].hasdata) delete deposits[ix].data; - if (deposits[ix].hasdata) id2ix[deposits[ix].id] = 0; // indicate that it does not exist - deposits[ix].hasdata = false; - } - // send message that it is ready (value of message is not used) - MPI_Ssend(&message, 1, MPI_INT, status.MPI_SOURCE, 78, comm_bank); } else if (message == GET_NODEDATA or message == GET_NODEBLOCK) { // NB: has no queue system yet int nodeid = messages[2]; // which block to fetch from - if (nodeid2block.count(nodeid) and nodeid2block[nodeid] != nullptr) { - Blockdata_struct *block = nodeid2block[nodeid]; + if (nodeid2block.count(nodeid)) { + Blockdata_struct &block = nodeid2block[nodeid]; int dataindex = 0; // internal index of the data in the block int size = 0; if (message == GET_NODEDATA) { int orbid = messages[3]; // which part of the block to fetch - dataindex = block->id2data[orbid]; // column of the data in the block - size = block->N_rows[dataindex]; // number of doubles to fetch + dataindex = block.id2data[orbid]; // column of the data in the block + size = block.N_rows; // number of doubles to fetch if (size != messages[4]) std::cout << "ERROR nodedata has wrong size" << std::endl; + double *data_p = block.data[dataindex]; + if (size > 0) MPI_Send(data_p, size, MPI_DOUBLE, status.MPI_SOURCE, 3, comm_bank); } else { // send entire block. First make one contiguous superblock // Prepare the data as one contiguous block - if (block->data.size() == 0) std::cout << "Zero size blockdata! " << nodeid << " " << block->N_rows.size() << std::endl; - block->BlockData.resize(block->N_rows[0], block->data.size()); - size = block->N_rows[0] * block->data.size(); - if (printinfo) std::cout << " rewrite into superblock " << block->data.size() << " " << block->N_rows[0] << " nodeid " << nodeid << std::endl; - for (int j = 0; j < block->data.size(); j++) { - for (int i = 0; i < block->N_rows[j]; i++) { block->BlockData(i, j) = block->data[j][i]; } - } - // repoint to the data in BlockData - for (int j = 0; j < block->data.size(); j++) { - if (block->deleted[j] == true) std::cout << "ERROR data already deleted " << std::endl; - assert(block->deleted[j] == false); - delete[] block->data[j]; - block->deleted[j] = true; - block->data[j] = block->BlockData.col(j).data(); - } + if (block.data.size() == 0) std::cout << "Zero size blockdata! " << nodeid << " " << block.N_rows << std::endl; + MatrixXd DataBlock(block.N_rows, block.data.size()); + size = block.N_rows * block.data.size(); + if (printinfo) std::cout << " rewrite into superblock " << block.data.size() << " " << block.N_rows << " nodeid " << nodeid << std::endl; + for (int j = 0; j < block.data.size(); j++) { + for (int i = 0; i < block.N_rows; i++) { DataBlock(i, j) = block.data[j][i]; } + } dataindex = 0; // start from first column // send info about the size of the superblock metadata_block[0] = nodeid; // nodeid - metadata_block[1] = block->data.size(); // number of columns + metadata_block[1] = block.data.size(); // number of columns metadata_block[2] = size; // total size = rows*columns MPI_Send(metadata_block, size_metadata, MPI_INT, status.MPI_SOURCE, 1, comm_bank); // send info about the id of each column - MPI_Send(block->id.data(), metadata_block[1], MPI_INT, status.MPI_SOURCE, 2, comm_bank); + MPI_Send(block.id.data(), metadata_block[1], MPI_INT, status.MPI_SOURCE, 2, comm_bank); + if (size > 0) MPI_Send(DataBlock.data(), size, MPI_DOUBLE, status.MPI_SOURCE, 3, comm_bank); } - double *data_p = block->data[dataindex]; - if (size > 0) MPI_Send(data_p, size, MPI_DOUBLE, status.MPI_SOURCE, 3, comm_bank); } else { if (printinfo) std::cout << " block " << nodeid << " does not exist " << std::endl; // Block with this id does not exist. @@ -253,27 +221,30 @@ void Bank::open() { // NB: BLOCKDATA has no queue system yet int orbid = messages[2]; // which block to fetch from - if (orbid2block.count(orbid) and orbid2block[orbid] != nullptr) { - Blockdata_struct *block = orbid2block[orbid]; - int dataindex = 0; // internal index of the data in the block - int size = 0; + if (orbid2block.count(orbid)) { + OrbBlock_struct &block = orbid2block[orbid]; + if (block.data.size() == 0) std::cout << "Zero size blockdata! C " << orbid << " " << std::endl; // send entire block. First make one contiguous superblock // Prepare the data as one contiguous block - if (block->data.size() == 0) std::cout << "Zero size blockdata! C " << orbid << " " << block->N_rows.size() << std::endl; - size = 0; - for (int j = 0; j < block->data.size(); j++) size += block->N_rows[j]; - + int size = 0; + for (int j = 0; j < block.data.size(); j++) { + int nodeid = block.id[j]; + int Nrows = nodeid2block[nodeid].N_rows; // note that root nodes have scaling and wavelets, while other nodes have only wavelets -> N_rows is not a constant. + size += Nrows; + } std::vector coeff(size); int ij = 0; - for (int j = 0; j < block->data.size(); j++) { - for (int i = 0; i < block->N_rows[j]; i++) { coeff[ij++] = block->data[j][i]; } + for (int j = 0; j < block.data.size(); j++) { + int nodeid = block.id[j]; + int Nrows = nodeid2block[nodeid].N_rows; + for (int i = 0; i < Nrows; i++) { coeff[ij++] = block.data[j][i]; } } // send info about the size of the superblock metadata_block[0] = orbid; - metadata_block[1] = block->data.size(); // number of columns + metadata_block[1] = block.data.size(); // number of columns metadata_block[2] = size; // total size = rows*columns MPI_Send(metadata_block, size_metadata, MPI_INT, status.MPI_SOURCE, 1, comm_bank); - MPI_Send(block->id.data(), metadata_block[1], MPI_INT, status.MPI_SOURCE, 2, comm_bank); + MPI_Send(block.id.data(), metadata_block[1], MPI_INT, status.MPI_SOURCE, 2, comm_bank); MPI_Send(coeff.data(), size, MPI_DOUBLE, status.MPI_SOURCE, 3, comm_bank); } else { // it is possible and allowed that the block has not been written @@ -336,7 +307,6 @@ void Bank::open() { id2ix[id] = 0; } } - // if (message == GET_FUNCTION) { send_function(*deposits[ix].orb, status.MPI_SOURCE, 1, comm_bank); } if (message == GET_DATA) { MPI_Send(deposits[ix].data, deposits[ix].datasize, MPI_DOUBLE, status.MPI_SOURCE, 1, comm_bank); } } } else if (message == SAVE_NODEDATA) { @@ -346,38 +316,26 @@ void Bank::open() { // test if the block exists already if (printinfo) std::cout << world_rank << " save data nodeid " << nodeid << " size " << size << std::endl; - if (nodeid2block.count(nodeid) == 0 or nodeid2block[nodeid] == nullptr) { - if (printinfo) std::cout << world_rank << " block does not exist yet " << std::endl; - // the block does not exist yet, create it - Blockdata_struct *block = new Blockdata_struct; - nodeid2block[nodeid] = block; - } - if (orbid2block.count(orbid) == 0 or orbid2block[orbid] == nullptr) { - // the block does not exist yet, create it - Blockdata_struct *orbblock = new Blockdata_struct; - orbid2block[orbid] = orbblock; - } // append the incoming data - Blockdata_struct *block = nodeid2block[nodeid]; - block->id2data[orbid] = nodeid2block[nodeid]->data.size(); // internal index of the data in the block - double *data_p = new double[size]; + Blockdata_struct &block = nodeid2block[nodeid]; + block.id2data[orbid] = nodeid2block[nodeid].data.size(); // internal index of the data in the block + double *data_p = mem[account]->get_mem(size);//new double[size]; currentsize[account] += size / 128; // converted into kB totcurrentsize += size / 128; // converted into kB this->maxsize = std::max(totcurrentsize, this->maxsize); - block->data.push_back(data_p); - block->deleted.push_back(false); - block->id.push_back(orbid); - block->N_rows.push_back(size); + block.data.push_back(data_p); + block.id.push_back(orbid); + if (block.N_rows > 0 and block.N_rows != size) cout<<" ERROR block size incompatible " <id2data[nodeid] = orbblock->data.size(); // internal index of the data in the block - orbblock->data.push_back(data_p); - orbblock->deleted.push_back(false); - orbblock->id.push_back(nodeid); - orbblock->N_rows.push_back(size); + OrbBlock_struct &orbblock = orbid2block[orbid]; + orbblock.id2data[nodeid] = orbblock.data.size(); // internal index of the data in the block + orbblock.data.push_back(data_p); + orbblock.id.push_back(nodeid); + //orbblock.N_rows.push_back(size); MPI_Recv(data_p, size, MPI_DOUBLE, status.MPI_SOURCE, 1, comm_bank, &status); - if (printinfo) std::cout << " written block " << nodeid << " id " << orbid << " subblocks " << nodeid2block[nodeid]->data.size() << std::endl; + if (printinfo) std::cout << " written block " << nodeid << " id " << orbid << " subblocks " << nodeid2block[nodeid].data.size() << std::endl; } else if (message == SAVE_FUNCTION or message == SAVE_DATA) { // make a new deposit int id = messages[2]; @@ -402,7 +360,11 @@ void Bank::open() { if (message == SAVE_DATA and !deposits[ix].hasdata) { datasize = messages[3]; exist_flag = 0; - deposits[ix].data = new double[datasize]; + // deposits[ix].data = new double[datasize]; + deposits[ix].data = mem[account]->get_mem(datasize); + currentsize[account] += datasize / 128; // converted into kB + totcurrentsize += datasize / 128; // converted into kB + this->maxsize = std::max(totcurrentsize, this->maxsize); deposits[ix].hasdata = true; } } else { @@ -411,7 +373,10 @@ void Bank::open() { if (message == SAVE_FUNCTION) deposits[ix].orb = new ComplexFunction(0); if (message == SAVE_DATA) { datasize = messages[3]; - deposits[ix].data = new double[datasize]; + deposits[ix].data = mem[account]->get_mem(datasize);//new double[datasize]; + currentsize[account] += datasize / 128; // converted into kB + totcurrentsize += datasize / 128; // converted into kB + this->maxsize = std::max(totcurrentsize, this->maxsize); deposits[ix].hasdata = true; } } @@ -420,6 +385,7 @@ void Bank::open() { deposits[ix].source = status.MPI_SOURCE; if (message == SAVE_FUNCTION) { recv_function(*deposits[ix].orb, deposits[ix].source, 1, comm_bank); + cout<<"recv ORB size "<getSizeNodes(NUMBER::Total)<getSizeNodes(NUMBER::Total); totcurrentsize += deposits[ix].orb->getSizeNodes(NUMBER::Total); @@ -430,9 +396,6 @@ void Bank::open() { datasize = messages[3]; deposits[ix].datasize = datasize; MPI_Recv(deposits[ix].data, datasize, MPI_DOUBLE, deposits[ix].source, 1, comm_bank, &status); - currentsize[account] += datasize / 128; // converted into kB - totcurrentsize += datasize / 128; // converted into kB - this->maxsize = std::max(totcurrentsize, this->maxsize); } if (id2qu[deposits[ix].id] != 0) { // someone is waiting for those data. Send to them @@ -509,6 +472,7 @@ int Bank::clearAccount(int account, int iclient, MPI_Comm comm) { } void Bank::remove_account(int account) { #ifdef MRCPP_HAS_MPI + naccounts--; auto it = get_deposits.find(account); if (it == get_deposits.end() || it->second == nullptr) { cout << "ERROR, my account depositsdoes not exist!! " << account << " " << endl; @@ -516,75 +480,41 @@ void Bank::remove_account(int account) { } std::vector &deposits = *get_deposits[account]; for (int ix = 1; ix < deposits.size(); ix++) { - if (deposits[ix].orb != nullptr) deposits[ix].orb->free(NUMBER::Total); - if (deposits[ix].hasdata) delete deposits[ix].data; - deposits[ix].hasdata = false; + if (deposits[ix].orb != nullptr) deposits[ix].orb->free(NUMBER::Total); + if (deposits[ix].hasdata) { + currentsize[account] -= deposits[ix].datasize / 128; + totcurrentsize -= deposits[ix].datasize / 128; + } + if (deposits[ix].hasdata) (*get_id2ix[account])[deposits[ix].id] = 0; // indicate that it does not exist + deposits[ix].hasdata = false; } deposits.clear(); + get_deposits.erase(account); delete get_queue[account]; get_queue.erase(account); delete get_id2ix[account]; - delete get_id2qu[account]; - delete get_readytasks[account]; get_id2ix.erase(account); + delete get_id2qu[account]; get_id2qu.erase(account); - get_deposits.erase(account); - currentsize.erase(account); + delete get_readytasks[account]; get_readytasks.erase(account); - std::map &nodeid2block = *get_nodeid2block[account]; - std::map &orbid2block = *get_orbid2block[account]; + std::map &nodeid2block = *get_nodeid2block[account]; + std::map &orbid2block = *get_orbid2block[account]; - std::vector toeraseVec; // it is dangerous to erase an iterator within its own loop for (auto const &block : nodeid2block) { - if (block.second == nullptr) toeraseVec.push_back(block.first); - if (block.second == nullptr) continue; - for (int i = 0; i < block.second->data.size(); i++) { - if (not block.second->deleted[i]) { - currentsize[account] -= block.second->N_rows[i] / 128; // converted into kB - totcurrentsize -= block.second->N_rows[i] / 128; // converted into kB - delete[] block.second->data[i]; - } - } - currentsize[account] -= block.second->BlockData.size() / 128; // converted into kB - totcurrentsize -= block.second->BlockData.size() / 128; // converted into kB - block.second->BlockData.resize(0, 0); // NB: the matrix does not clear itself otherwise - // assert(currentsize[account] >= 0); - toeraseVec.push_back(block.first); - } - for (int ierase : toeraseVec) { nodeid2block.erase(ierase); } - toeraseVec.clear(); - std::vector datatoeraseVec; // it is dangerous to erase an iterator within its own loop - for (auto const &block : orbid2block) { - if (block.second == nullptr) toeraseVec.push_back(block.first); - if (block.second == nullptr) continue; - datatoeraseVec.clear(); - for (int i = 0; i < block.second->data.size(); i++) { - datatoeraseVec.push_back(i); - block.second->data[i] = nullptr; - } - std::sort(datatoeraseVec.begin(), datatoeraseVec.end()); - std::reverse(datatoeraseVec.begin(), datatoeraseVec.end()); - for (int ierase : datatoeraseVec) { - block.second->id.erase(block.second->id.begin() + ierase); - block.second->data.erase(block.second->data.begin() + ierase); - block.second->N_rows.erase(block.second->N_rows.begin() + ierase); - } - if (block.second->data.size() == 0) toeraseVec.push_back(block.first); + currentsize[account] -= block.second.N_rows * block.second.data.size()/ 128; // converted into kB + totcurrentsize -= block.second.N_rows * block.second.data.size()/ 128; // converted into kB } - for (int ierase : toeraseVec) { orbid2block.erase(ierase); } - + nodeid2block.clear(); orbid2block.clear(); - for (int ix = 1; ix < deposits.size(); ix++) { - if (deposits[ix].hasdata) delete deposits[ix].data; - if (deposits[ix].hasdata) (*get_id2ix[account])[deposits[ix].id] = 0; // indicate that it does not exist - deposits[ix].hasdata = false; - } - delete get_nodeid2block[account]; - delete get_orbid2block[account]; + get_nodeid2block.erase(account); get_orbid2block.erase(account); + for (double* c_p : mem[account]->chunk_p) delete [] c_p; + mem.erase(account); + currentsize.erase(account); #endif } @@ -804,7 +734,6 @@ int BankAccount::put_data(NodeIndex<3> nIdx, int size, double *data) { messages[5] = nIdx.getTranslation(1); messages[6] = nIdx.getTranslation(2); int id = std::abs(nIdx.getTranslation(0) + nIdx.getTranslation(1) + nIdx.getTranslation(2)); - // std::cout< &idVec) { #ifdef MRCPP_HAS_MPI MPI_Status status; @@ -898,7 +827,7 @@ int BankAccount::get_nodeblock(int nodeid, double *data, std::vector &idVec return 1; } -// get all data with identity orbid +// get all data with identity orbid (same orbital, different nodes) int BankAccount::get_orbblock(int orbid, double *&data, std::vector &nodeidVec, int bankstart) { #ifdef MRCPP_HAS_MPI MPI_Status status; @@ -919,30 +848,6 @@ int BankAccount::get_orbblock(int orbid, double *&data, std::vector &nodeid return 1; } -// remove all blockdata with nodeid < nodeidmax -// NB:: collective call. All clients must call this -void BankAccount::clear_blockdata(int iclient, int nodeidmax, MPI_Comm comm) { -#ifdef MRCPP_HAS_MPI - // 1) wait until all clients are ready - MPI_Barrier(comm); - // master send signal to bank - if (iclient == 0) { - int messages[message_size]; - messages[0] = CLEAR_BLOCKS; - messages[1] = account_id; - messages[2] = nodeidmax; - for (int i = 0; i < bank_size; i++) { MPI_Send(messages, 3, MPI_INT, bankmaster[i], 0, comm_bank); } - for (int i = 0; i < bank_size; i++) { - // wait until Bank is finished and has sent signal - MPI_Status status; - int message; - MPI_Recv(&message, 1, MPI_INT, bankmaster[i], 78, comm_bank, &status); - } - } - MPI_Barrier(comm); -#endif -} - // creator. NB: collective BankAccount::BankAccount(int iclient, MPI_Comm comm) { this->account_id = dataBank.openAccount(iclient, comm); diff --git a/src/utils/Bank.h b/src/utils/Bank.h index 18cba950f..501faa7a0 100644 --- a/src/utils/Bank.h +++ b/src/utils/Bank.h @@ -107,7 +107,6 @@ class BankAccount { int get_nodedata(int id, int nodeid, int size, double *data, std::vector &idVec); int get_nodeblock(int nodeid, double *data, std::vector &idVec); int get_orbblock(int orbid, double *&data, std::vector &nodeidVec, int bankstart); - void clear_blockdata(int i = wrk_rank, int nodeidmax = 0, MPI_Comm comm = comm_wrk); }; class TaskManager { diff --git a/src/utils/ComplexFunction.cpp b/src/utils/ComplexFunction.cpp index 9d65cddf0..63d855727 100644 --- a/src/utils/ComplexFunction.cpp +++ b/src/utils/ComplexFunction.cpp @@ -304,7 +304,7 @@ void ComplexFunction::rescale(double c) { /** @brief In place multiply with complex scalar. Involves a deep copy.*/ void ComplexFunction::rescale(ComplexDouble c) { ComplexFunction &out = *this; - ComplexFunction tmp(isShared()); + ComplexFunction tmp(spin(), occ(), rank, isShared()); cplxfunc::deep_copy(tmp, out); out.free(NUMBER::Total); out.add(c, tmp); @@ -370,6 +370,7 @@ ComplexDouble cplxfunc::node_norm_dot(ComplexFunction bra, ComplexFunction ket, void cplxfunc::deep_copy(ComplexFunction &out, ComplexFunction &inp) { bool need_to_copy = not(out.isShared()) or mpi::share_master(); out.funcMRA = inp.funcMRA; + out.setRank(inp.getRank()); if (inp.hasReal()) { if (not out.hasReal()) out.alloc(NUMBER::Real); if (need_to_copy) { @@ -652,12 +653,25 @@ void cplxfunc::multiply_imag(ComplexFunction &out, ComplexFunction inp_a, Comple namespace mpifuncvec { -void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) { + +/** @brief Make a linear combination of functions + * + * Uses "local" representation: treats one node at a time. + * For each node, all functions are transformed simultaneously + * by a dense matrix multiplication. + * Phi input functions, Psi output functions + * + */ +void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, MPI_FuncVector &Psi, double prec) { // The principle of this routine is that nodes are rotated one by one using matrix multiplication. // The routine does avoid when possible to move data, but uses pointers and indices manipulation. // MPI version does not use OMP yet, Serial version uses OMP + // size of input is N, size of output is M int N = Phi.size(); + int M = Psi.size(); + if (U.rows() < N) MSG_ABORT("Incompatible number of rows for U matrix"); + if (U.cols() < M) MSG_ABORT("Incompatible number of columns for U matrix"); // 1) make union tree without coefficients FunctionTree<3> refTree(*Phi.vecMRA); @@ -674,21 +688,21 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) { refTree.makeCoeffVector(coeffVec_ref, indexVec_ref, parindexVec_ref, scalefac_ref, max_ix, refTree); int max_n = indexVec_ref.size(); - // 2) We work with real numbers only. Make real blocks for U matrix + // 2) We work with real numbers only. Make real blocks for U matrix bool UhasReal = false; bool UhasImag = false; for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - if (std::abs(U(i, j).real()) > MachineZero) UhasReal = true; - if (std::abs(U(i, j).imag()) > MachineZero) UhasImag = true; + for (int j = 0; j < M; j++) { + if (std::abs(U(i, j).real()) > 10*MachineZero) UhasReal = true; + if (std::abs(U(i, j).imag()) > 10*MachineZero) UhasImag = true; } } IntVector PsihasReIm = IntVector::Zero(2); - for (int i = 0; i < N; i++) { - if (!mpi::my_orb(i)) continue; - PsihasReIm[0] = (Phi[i].hasReal()) ? 1 : 0; - PsihasReIm[1] = (Phi[i].hasImag()) ? 1 : 0; + for (int j = 0; j < N; j++) { + if (!mpi::my_orb(j)) continue; + PsihasReIm[0] = (Phi[j].hasReal()) ? 1 : 0; + PsihasReIm[1] = (Phi[j].hasImag()) ? 1 : 0; } mpi::allreduce_vector(PsihasReIm, mpi::comm_wrk); if (not PsihasReIm[0] and not PsihasReIm[1]) { @@ -698,21 +712,23 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) { bool makeReal = (UhasReal and PsihasReIm[0]) or (UhasImag and PsihasReIm[1]); bool makeImag = (UhasReal and PsihasReIm[1]) or (UhasImag and PsihasReIm[0]); - for (int i = 0; i < N; i++) { - if (!mpi::my_orb(i)) continue; - if (not makeReal and Phi[i].hasReal()) Phi[i].free(NUMBER::Real); - if (not makeImag and Phi[i].hasImag()) Phi[i].free(NUMBER::Imag); + for (int j = 0; j < M; j++) { + if (!mpi::my_orb(j)) continue; + if (not makeReal and Psi[j].hasReal()) Psi[j].free(NUMBER::Real); + if (not makeImag and Psi[j].hasImag()) Psi[j].free(NUMBER::Imag); } if (not makeReal and not makeImag) { return; } - int Neff = N; // effective number of orbitals + int Neff = N; // effective number of input orbitals + int Meff = M; // effective number of output orbitals if (makeImag) Neff = 2 * N; // Imag and Real treated independently. We always use real part of U + if (makeImag) Meff = 2 * M; // Imag and Real treated independently. We always use real part of U IntVector conjMat = IntVector::Zero(Neff); - for (int i = 0; i < Neff; i++) { - if (!mpi::my_orb(i % N)) continue; - conjMat[i] = (Phi[i % N].conjugate()) ? -1 : 1; + for (int j = 0; j < Neff; j++) { + if (!mpi::my_orb(j % N)) continue; + conjMat[j] = (Phi[j % N].conjugate()) ? -1 : 1; } mpi::allreduce_vector(conjMat, mpi::comm_wrk); @@ -720,22 +736,22 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) { // out_r = U_rr*in_r - U_ir*in_i*conjMat // out_i = U_ri*in_r - U_ii*in_i*conjMat // the first index of U is the one used on input Phi - DoubleMatrix Ureal(Neff, Neff); // four blocks, for rr ri ir ii - for (int i = 0; i < Neff; i++) { - for (int j = 0; j < Neff; j++) { + DoubleMatrix Ureal(Neff, Meff); // four blocks, for rr ri ir ii + for (int j = 0; j < Neff; j++) { + for (int i = 0; i < Meff; i++) { double sign = 1.0; - if (i < N and j < N) { + if (j < N and i < M) { // real U applied on real Phi - Ureal(i, j) = U.real()(i % N, j % N); - } else if (i >= N and j >= N) { + Ureal(j, i) = U.real()(j % N, i % M); + } else if (j >= N and i >= M) { // real U applied on imag Phi - Ureal(i, j) = conjMat[i] * U.real()(i % N, j % N); - } else if (i < N and j >= N) { + Ureal(j, i) = conjMat[j] * U.real()(j % N, i % M); + } else if (j < N and i >= M) { // imag U applied on real Phi - Ureal(i, j) = U.imag()(i % N, j % N); + Ureal(j, i) = U.imag()(j % N, i % M); } else { // imag U applied on imag Phi - Ureal(i, j) = -1.0 * conjMat[i] * U.imag()(i % N, j % N); + Ureal(j, i) = -1.0 * conjMat[j] * U.imag()(j % N, i % M); } } } @@ -788,21 +804,19 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) { } // 4) rotate all the nodes - IntMatrix split_serial; // in the serial case all split are store in one array - std::vector split(Neff, -1.0); // which orbitals need splitting (at a given node). For now double for compatibilty with bank - std::vector needsplit(Neff, 1.0); // which orbitals need splitting - std::vector> coeffpVec(Neff); // to put pointers to the rotated coefficient for each orbital in serial case - std::vector> ix2coef(Neff); // to find the index in for example rotCoeffVec[] corresponding to a serialIx + IntMatrix split_serial; // in the serial case all split are stored in one array + std::vector> coeffpVec(Meff); // to put pointers to the rotated coefficient for each orbital in serial case + std::vector> ix2coef(Meff); // to find the index in for example rotCoeffVec[] corresponding to a serialIx int csize; // size of the current coefficients (different for roots and branches) std::vector rotatedCoeffVec; // just to ensure that the data from rotatedCoeff is not deleted, since we point to it. // j indices are for unrotated orbitals, i indices are for rotated orbitals if (serial) { std::map ix2coef_ref; // to find the index n corresponding to a serialIx - split_serial.resize(Neff, max_n); // not use in the MPI case + split_serial.resize(Meff, max_n); // not use in the MPI case for (int n = 0; n < max_n; n++) { int node_ix = indexVec_ref[n]; // SerialIx for this node in the reference tree ix2coef_ref[node_ix] = n; - for (int i = 0; i < Neff; i++) split_serial(i, n) = 1; + for (int i = 0; i < Meff; i++) split_serial(i, n) = 1; } std::vector nodeReady(max_n, 0); // To indicate to OMP threads that the parent is ready (for splits) @@ -836,9 +850,9 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) { }; std::vector orbiVec; - for (int i = 0; i < Neff; i++) { // loop over all rotated orbitals - if (not makeReal and i < N) continue; - if (not makeImag and i >= N) continue; + for (int i = 0; i < Meff; i++) { // loop over all rotated orbitals + if (not makeReal and i < M) continue; + if (not makeImag and i >= M) continue; if (parindexVec_ref[n] >= 0 and split_serial(i, ix2coef_ref[parindexVec_ref[n]]) == 0) continue; // parent node has too small wavelets orbiVec.push_back(i); } @@ -886,12 +900,11 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) { rotatedCoeffVec.push_back(std::move(rotatedCoeff)); } } - } else { // MPI case // TODO? rotate in bank, so that we do not get and put. Requires clever handling of splits. - std::vector split(Neff, -1.0); // which orbitals need splitting (at a given node). For now double for compatibilty with bank - std::vector needsplit(Neff, 1.0); // which orbitals need splitting + std::vector split(Meff, -1.0); // which orbitals need splitting (at a given node). For now double for compatibilty with bank + std::vector needsplit(Meff, 1.0); // which orbitals need splitting BankAccount nodeSplits; mpi::barrier(mpi::comm_wrk); // required for now, as the blockdata functionality has no queue yet. @@ -906,13 +919,13 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) { int parentid = parindexVec_ref[n]; if (parentid == -1) { // root node, split if output needed - for (int i = 0; i < N; i++) { + for (int i = 0; i < M; i++) { if (makeReal) split[i] = 1.0; else split[i] = -1.0; } - for (int i = N; i < Neff; i++) { + for (int i = N; i < Meff; i++) { if (makeImag) split[i] = 1.0; else @@ -921,12 +934,12 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) { csize = sizecoeff; } else { // note that it will wait until data is available - nodeSplits.get_data(parentid, Neff, split.data()); + nodeSplits.get_data(parentid, Meff, split.data()); csize = sizecoeffW; } std::vector orbiVec; std::vector orbjVec; - for (int i = 0; i < Neff; i++) { // loop over rotated orbitals + for (int i = 0; i < Meff; i++) { // loop over rotated orbitals if (split[i] < 0.0) continue; // parent node has too small wavelets orbiVec.push_back(i); } @@ -960,7 +973,7 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) { if (thres < wnorm or prec < 0) needsplit[orbiVec[i]] = 1.0; nodesRotated.put_nodedata(orbiVec[i], indexVec_ref[n] + max_ix, csize, rotatedCoeff.col(i).data()); } - nodeSplits.put_data(indexVec_ref[n], Neff, needsplit.data()); + nodeSplits.put_data(indexVec_ref[n], Meff, needsplit.data()); } mpi::barrier(mpi::comm_wrk); // wait until all rotated nodes are ready } @@ -973,22 +986,23 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) { // operation is writing the coefficient into the tree) #pragma omp parallel for schedule(static) - for (int j = 0; j < Neff; j++) { - if (j < N) { - if (!Phi[j].hasReal()) Phi[j].alloc(NUMBER::Real); - Phi[j].real().clear(); - Phi[j].real().makeTreefromCoeff(refTree, coeffpVec[j], ix2coef[j], prec); + for (int j = 0; j < Meff; j++) { + if (coeffpVec[j].size()==0) continue; + if (j < M) { + if (!Psi[j].hasReal()) Psi[j].alloc(NUMBER::Real); + Psi[j].real().clear(); + Psi[j].real().makeTreefromCoeff(refTree, coeffpVec[j], ix2coef[j], prec); } else { - if (!Phi[j % N].hasImag()) Phi[j % N].alloc(NUMBER::Imag); - Phi[j % N].imag().clear(); - Phi[j % N].imag().makeTreefromCoeff(refTree, coeffpVec[j], ix2coef[j], prec); + if (!Psi[j % M].hasImag()) Psi[j % M].alloc(NUMBER::Imag); + Psi[j % M].imag().clear(); + Psi[j % M].imag().makeTreefromCoeff(refTree, coeffpVec[j], ix2coef[j], prec); } } } else { // MPI case - for (int j = 0; j < Neff; j++) { - if (not mpi::my_orb(j % N)) continue; + for (int j = 0; j < Meff; j++) { + if (not mpi::my_orb(j % M)) continue; // traverse possible nodes, and stop descending when norm is zero (leaf in out[j]) std::vector coeffpVec; // std::map ix2coef; // to find the index in coeffVec[] corresponding to a serialIx @@ -1010,16 +1024,16 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) { shift += csize; } } - if (j < N) { + if (j < M) { // Real part - if (!Phi[j].hasReal()) Phi[j].alloc(NUMBER::Real); - Phi[j].real().clear(); - Phi[j].real().makeTreefromCoeff(refTree, coeffpVec, ix2coef, prec); + if (!Psi[j].hasReal()) Psi[j].alloc(NUMBER::Real); + Psi[j].real().clear(); + Psi[j].real().makeTreefromCoeff(refTree, coeffpVec, ix2coef, prec); } else { // Imag part - if (!Phi[j % N].hasImag()) Phi[j % N].alloc(NUMBER::Imag); - Phi[j % N].imag().clear(); - Phi[j % N].imag().makeTreefromCoeff(refTree, coeffpVec, ix2coef, prec); + if (!Psi[j % M].hasImag()) Psi[j % M].alloc(NUMBER::Imag); + Psi[j % M].imag().clear(); + Psi[j % M].imag().makeTreefromCoeff(refTree, coeffpVec, ix2coef, prec); } for (double *p : pointerstodelete) delete[] p; pointerstodelete.clear(); @@ -1027,6 +1041,12 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) { } } + +void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) { + rotate(Phi, U, Phi, prec); + return; +} + /** @brief Save all nodes in bank; identify them using serialIx from refTree * shift is a shift applied in the id */ @@ -1551,10 +1571,10 @@ ComplexMatrix calc_overlap_matrix(MPI_FuncVector &BraKet) { S_temp.noalias() = coeffBlock.transpose() * coeffBlock; for (int i = 0; i < orbVec.size(); i++) { for (int j = 0; j < orbVec.size(); j++) { - // if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta) - // continue; - // if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha) - // continue; + if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta) + continue; + if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha) + continue; double &Srealij = Sreal(orbVec[i], orbVec[j]); double &Stempij = S_temp(i, j); #pragma omp atomic @@ -1572,17 +1592,16 @@ ComplexMatrix calc_overlap_matrix(MPI_FuncVector &BraKet) { S_temp.noalias() = coeffBlock.transpose() * coeffBlock; for (int i = 0; i < orbVec.size(); i++) { for (int j = 0; j < orbVec.size(); j++) { - // if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta) - // continue; - // if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha) - // continue; + if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta) + continue; + if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha) + continue; Sreal(orbVec[i], orbVec[j]) += S_temp(i, j); } } } } } - IntVector conjMat = IntVector::Zero(N); for (int i = 0; i < N; i++) { if (!mrcpp::mpi::my_orb(BraKet[i])) continue; @@ -1716,7 +1735,8 @@ ComplexMatrix calc_overlap_matrix(MPI_FuncVector &Bra, MPI_FuncVector &Ket) { int totget = 0; int mxtotsiz = 0; int ibank = 0; -#pragma omp parallel for schedule(dynamic) if (serial) + //For some unknown reason the h2_mag_lda test sometimes fails when schedule(dynamic) is chosen +#pragma omp parallel for schedule(static) if (serial) for (int n = 0; n < max_n; n++) { if (n % mrcpp::mpi::wrk_size != mrcpp::mpi::wrk_rank) continue; int csize; @@ -1749,10 +1769,10 @@ ComplexMatrix calc_overlap_matrix(MPI_FuncVector &Bra, MPI_FuncVector &Ket) { S_temp.noalias() = coeffBlockBra.transpose() * coeffBlockKet; for (int i = 0; i < orbVecBra.size(); i++) { for (int j = 0; j < orbVecKet.size(); j++) { - // if (Bra[orbVecBra[i] % N].spin() == SPIN::Alpha and Ket[orbVecKet[j] % M].spin() == SPIN::Beta) - // continue; - // if (Bra[orbVecBra[i] % N].spin() == SPIN::Beta and Ket[orbVecKet[j] % M].spin() == SPIN::Alpha) - // continue; + if (Bra[orbVecBra[i] % N].spin() == SPIN::Alpha and Ket[orbVecKet[j] % M].spin() == SPIN::Beta) + continue; + if (Bra[orbVecBra[i] % N].spin() == SPIN::Beta and Ket[orbVecKet[j] % M].spin() == SPIN::Alpha) + continue; // must ensure that threads are not competing double &Srealij = Sreal(orbVecBra[i], orbVecKet[j]); double &Stempij = S_temp(i, j); @@ -1761,7 +1781,6 @@ ComplexMatrix calc_overlap_matrix(MPI_FuncVector &Bra, MPI_FuncVector &Ket) { } } } - } else { DoubleMatrix coeffBlockBra(csize, 2 * N); @@ -1778,10 +1797,10 @@ ComplexMatrix calc_overlap_matrix(MPI_FuncVector &Bra, MPI_FuncVector &Ket) { S_temp.noalias() = coeffBlockBra.transpose() * coeffBlockKet; for (int i = 0; i < orbVecBra.size(); i++) { for (int j = 0; j < orbVecKet.size(); j++) { - // if (Bra[orbVecBra[i] % N].spin() == SPIN::Alpha and Ket[orbVecKet[j] % M].spin() == SPIN::Beta) - // continue; - // if (Bra[orbVecBra[i] % N].spin() == SPIN::Beta and Ket[orbVecKet[j] % M].spin() == SPIN::Alpha) - // continue; + if (Bra[orbVecBra[i] % N].spin() == SPIN::Alpha and Ket[orbVecKet[j] % M].spin() == SPIN::Beta) + continue; + if (Bra[orbVecBra[i] % N].spin() == SPIN::Beta and Ket[orbVecKet[j] % M].spin() == SPIN::Alpha) + continue; Sreal(orbVecBra[i], orbVecKet[j]) += S_temp(i, j); } } @@ -1813,12 +1832,6 @@ ComplexMatrix calc_overlap_matrix(MPI_FuncVector &Bra, MPI_FuncVector &Ket) { mrcpp::mpi::allreduce_matrix(S, mrcpp::mpi::comm_wrk); - conjMatKet[0] = totsiz; - conjMatKet[1] = mxtotsiz; - conjMatKet[2] = totget; - - mrcpp::mpi::allreduce_vector(conjMatKet, mrcpp::mpi::comm_wrk); - return S; } @@ -1922,10 +1935,10 @@ DoubleMatrix calc_norm_overlap_matrix(MPI_FuncVector &BraKet) { S_temp.noalias() = coeffBlock.transpose() * coeffBlock; for (int i = 0; i < orbVec.size(); i++) { for (int j = 0; j < orbVec.size(); j++) { - // if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta) - // continue; - // if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha) - // continue; + if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta) + continue; + if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha) + continue; double &Srealij = Sreal(orbVec[i], orbVec[j]); double &Stempij = S_temp(i, j); #pragma omp atomic @@ -1944,10 +1957,10 @@ DoubleMatrix calc_norm_overlap_matrix(MPI_FuncVector &BraKet) { S_temp.noalias() = coeffBlock.transpose() * coeffBlock; for (int i = 0; i < orbVec.size(); i++) { for (int j = 0; j < orbVec.size(); j++) { - // if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta) - // continue; - // if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha) - // continue; + if (BraKet[orbVec[i] % N].spin() == SPIN::Alpha and BraKet[orbVec[j] % N].spin() == SPIN::Beta) + continue; + if (BraKet[orbVec[i] % N].spin() == SPIN::Beta and BraKet[orbVec[j] % N].spin() == SPIN::Alpha) + continue; Sreal(orbVec[i], orbVec[j]) += S_temp(i, j); } } @@ -1971,9 +1984,33 @@ DoubleMatrix calc_norm_overlap_matrix(MPI_FuncVector &BraKet) { // Assumes linearity: result is sum of all nodes contributions mrcpp::mpi::allreduce_matrix(S, mrcpp::mpi::comm_wrk); - return S; } +/** @brief Orthogonalize the functions in Bra against all orbitals in Ket + * + */ +void orthogonalize(double prec, MPI_FuncVector &Bra, MPI_FuncVector &Ket) { + // TODO: generalize for cases where Ket functions are not orthogonal to each other? + ComplexMatrix S = mpifuncvec::calc_overlap_matrix(Bra, Ket); + int N = Bra.size(); + int M = Ket.size(); + DoubleVector Ketnorms = DoubleVector::Zero(M); + for (int i = 0; i < M; i++) { + if (mpi::my_orb(Ket[i])) Ketnorms(i) = Ket[i].squaredNorm(); + } + mrcpp::mpi::allreduce_vector(Ketnorms, mrcpp::mpi::comm_wrk); + ComplexMatrix rmat = ComplexMatrix::Zero(M, N); + for (int j = 0; j < N; j++) { + for (int i = 0; i < M; i++) { + rmat(i,j) = 0.0 - S.conjugate()(j,i)/Ketnorms(i); + } + } + MPI_FuncVector rotatedKet(N); + mpifuncvec::rotate(Ket, rmat, rotatedKet, prec / M); + for (int j = 0; j < N; j++) { + if(my_orb(Bra[j]))Bra[j].add(1.0,rotatedKet[j]); + } +} } // namespace mpifuncvec } // namespace mrcpp diff --git a/src/utils/ComplexFunction.h b/src/utils/ComplexFunction.h index 8fbd78bc5..cf33cadfb 100644 --- a/src/utils/ComplexFunction.h +++ b/src/utils/ComplexFunction.h @@ -186,6 +186,7 @@ class MPI_FuncVector : public std::vector { namespace mpifuncvec { void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec = -1.0); +void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, MPI_FuncVector &Psi, double prec = -1.0); void save_nodes(MPI_FuncVector &Phi, mrcpp::FunctionTree<3> &refTree, BankAccount &account, int sizes = -1); MPI_FuncVector multiply(MPI_FuncVector &Phi, RepresentableFunction<3> &f, double prec = -1.0, ComplexFunction *Func = nullptr, int nrefine = 1, bool all = false); ComplexVector dot(MPI_FuncVector &Bra, MPI_FuncVector &Ket); @@ -193,5 +194,6 @@ ComplexMatrix calc_lowdin_matrix(MPI_FuncVector &Phi); ComplexMatrix calc_overlap_matrix(MPI_FuncVector &BraKet); ComplexMatrix calc_overlap_matrix(MPI_FuncVector &Bra, MPI_FuncVector &Ket); DoubleMatrix calc_norm_overlap_matrix(MPI_FuncVector &BraKet); +void orthogonalize(double prec, MPI_FuncVector &Bra, MPI_FuncVector &Ket); } // namespace mpifuncvec } // namespace mrcpp diff --git a/src/utils/mpi_utils.h b/src/utils/mpi_utils.h index c2efb0c21..1b94b0dc9 100644 --- a/src/utils/mpi_utils.h +++ b/src/utils/mpi_utils.h @@ -51,6 +51,7 @@ extern int sh_group_rank; extern int is_bank; extern int is_bankclient; extern int bank_size; +extern int omp_threads; extern int tot_bank_size; extern int max_tag; extern int task_bank; diff --git a/src/utils/omp_utils.cpp b/src/utils/omp_utils.cpp index 0ea4d5bdc..67ccd3069 100644 --- a/src/utils/omp_utils.cpp +++ b/src/utils/omp_utils.cpp @@ -24,19 +24,18 @@ */ #include "omp_utils.h" - +#include namespace mrcpp { // By default we get OMP_NUM_THREADS int max_threads = mrcpp_get_max_threads(); void set_max_threads(int threads) { - auto omp_max = mrcpp_get_max_threads(); - if (threads < 1 or threads > omp_max) { - mrcpp::max_threads = omp_max; - } else { - mrcpp::max_threads = threads; - } + threads = std::max(1, threads); + mrcpp::max_threads = threads; +#ifdef MRCPP_HAS_OMP + omp_set_num_threads(threads); +#endif } } // namespace mrcpp diff --git a/src/utils/parallel.cpp b/src/utils/parallel.cpp index 5198ac307..2877d12e9 100644 --- a/src/utils/parallel.cpp +++ b/src/utils/parallel.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include "Bank.h" #include "ComplexFunction.h" @@ -11,15 +12,18 @@ #ifdef MRCPP_HAS_OMP #define mrcpp_get_max_threads() omp_get_max_threads() +#define mrcpp_get_num_procs() omp_get_num_procs()/2 #define mrcpp_set_dynamic(n) omp_set_dynamic(n) #else #define mrcpp_get_max_threads() 1 +#define mrcpp_get_num_procs() 1 #define mrcpp_get_num_threads() 1 #define mrcpp_get_thread_num() 0 #define mrcpp_set_dynamic(n) #endif using mrcpp::Printer; +using namespace std; namespace mrcpp { @@ -51,9 +55,10 @@ int is_centralbank = 0; int is_bankclient = 1; int is_bankmaster = 0; // only one bankmaster is_bankmaster int bank_size = 0; +int omp_threads = -1; // can be set to force number of threads int tot_bank_size = 0; // size of bank, including the task manager int max_tag = 0; // max value allowed by MPI -std::vector bankmaster; +vector bankmaster; int task_bank = -1; // world rank of the task manager MPI_Comm comm_wrk; @@ -71,7 +76,6 @@ extern int const size_metadata = 3; void mpi::initialize() { Eigen::setNbThreads(1); mrcpp_set_dynamic(0); - mrcpp::set_max_threads(omp::n_threads); #ifdef MRCPP_HAS_MPI MPI_Init(nullptr, nullptr); @@ -90,7 +94,7 @@ void mpi::initialize() { if (mpi::world_size < 2) { mpi::bank_size = 0; } else if (mpi::bank_size < 0) { - mpi::bank_size = std::max(mpi::world_size / 3, 1); + mpi::bank_size = max(mpi::world_size / 3, 1); } if (mpi::world_size - mpi::bank_size < 1) MSG_ABORT("No MPI ranks left for working!"); if (mpi::bank_size < 1 and mpi::world_size > 1) MSG_ABORT("Bank size must be at least one when using MPI!"); @@ -154,6 +158,62 @@ void mpi::initialize() { max_tag = *(int *)val / 2; id_shift = max_tag / 2; // half is reserved for non orbital. + // determine the number of threads we can assign to each mpi worker. + // mrcpp_get_num_procs is total number of hardware logical threads accessible by this mpi + // We assume that half of them are physical cores. + // mrcpp_get_max_threads is OMP_NUM_THREADS (environment variable). + // omp_threads_available is the total number of logical threads available on this compute-node + // We assume that half of them are physical cores. + // + // six conditions should be satisfied: + // 1) no one use more than mrcpp_get_num_procs()/2 + // 2) NOT ENFORCED: no one use more than mrcpp_get_max_threads, as defined by rank 0 + // 3) the total number of threads used on the compute-node must not exceed omp_threads_available/2 + // 4) Bank needs only one thread + // 5) workers need as many threads as possible + // 6) at least one thread + + MPI_Comm comm_share_world;//all that share the memory + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &comm_share_world); + + int n_bank_thisnode; //number of banks on this node + MPI_Allreduce(&is_bank, &n_bank_thisnode, 1, MPI_INT, MPI_SUM, comm_share_world); + int n_wrk_thisnode; //number of workers on this node + MPI_Allreduce(&is_bankclient, &n_wrk_thisnode, 1, MPI_INT, MPI_SUM, comm_share_world); + + int omp_threads_available = thread::hardware_concurrency(); + int nthreads = 1; + if (is_bankclient) nthreads = (omp_threads_available/2-n_bank_thisnode)/n_wrk_thisnode; // 3) and 5) + + // do not exceed total number of cores accessible (assumed to be half the number of logical threads) + nthreads = min(nthreads, mrcpp_get_num_procs()); // 1) + + // NB: we do not use OMP_NUM_THREADS. Use all cores accessible. Could change this in the future + // if OMP_NUM_THREADS is set, do not exceed + // we enforce that all compute nodes use the same OMP_NUM_THREADS. Rank 0 decides. + /* int my_OMP_NUM_THREADS = mrcpp_get_max_threads(); + MPI_Bcast(&my_OMP_NUM_THREADS, 1, MPI_INT, 0, MPI_COMM_WORLD); + + if (my_OMP_NUM_THREADS > 0) nthreads = min(nthreads, my_OMP_NUM_THREADS); // 2) + */ + + nthreads = max(1, nthreads); // 6) + + if (is_bank) nthreads = 1; // 4) + + //cout< 0) { + if (omp_threads != nthreads and world_rank == 0) { + cout<<"Warning: recommended number of threads is "< &tree, MPI_Comm comm) { /** @brief make union tree without coeff and send to all * Include both real and imaginary parts */ -void mpi::allreduce_Tree_noCoeff(mrcpp::FunctionTree<3> &tree, std::vector &Phi, MPI_Comm comm) { +void mpi::allreduce_Tree_noCoeff(mrcpp::FunctionTree<3> &tree, vector &Phi, MPI_Comm comm) { /* 1) make union grid of own orbitals 2) make union grid with others orbitals (sent to rank zero) 3) rank zero broadcast func to everybody diff --git a/tests/trees/tree_io.cpp b/tests/trees/tree_io.cpp index 12af6f42a..b193382da 100644 --- a/tests/trees/tree_io.cpp +++ b/tests/trees/tree_io.cpp @@ -37,7 +37,7 @@ namespace tree_io { template void testProjectFunction(); SCENARIO("FunctionTree IO", "[tree_io], [trees]") { - const double prec = 1.0e-4; + const double prec = 1.0e-6; GaussFunc<3> *func = nullptr; initialize(&func);