From 93c504a45c6fe8efa1173670682d1ec23c925d69 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Fri, 16 Aug 2024 04:07:28 -0600 Subject: [PATCH 1/4] MueLu: Move Kokkos version of MapsAreNested Signed-off-by: Christian Glusa --- .../src/Utils/MueLu_UtilitiesBase_def.hpp | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp b/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp index 6e181415b09b..db4975471e59 100644 --- a/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp +++ b/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp @@ -2028,22 +2028,27 @@ UtilitiesBase:: template bool UtilitiesBase:: MapsAreNested(const Xpetra::Map& rowMap, const Xpetra::Map& colMap) { - ArrayView rowElements = rowMap.getLocalElementList(); - ArrayView colElements = colMap.getLocalElementList(); + using execution_space = typename Node::execution_space; + using range_type = Kokkos::RangePolicy; - const size_t numElements = rowElements.size(); + auto rowLocalMap = rowMap.getLocalMap(); + auto colLocalMap = colMap.getLocalMap(); - if (size_t(colElements.size()) < numElements) + const size_t numRows = rowLocalMap.getLocalNumElements(); + const size_t numCols = colLocalMap.getLocalNumElements(); + + if (numCols < numRows) return false; - bool goodMap = true; - for (size_t i = 0; i < numElements; i++) - if (rowElements[i] != colElements[i]) { - goodMap = false; - break; - } + size_t numDiff = 0; + Kokkos::parallel_reduce( + "MueLu:TentativePF:isGoodMap", range_type(0, numRows), + KOKKOS_LAMBDA(const LO i, size_t& diff) { + diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i)); + }, + numDiff); - return goodMap; + return (numDiff == 0); } template From 1d9fb4d48d014efd659a1ab68bfb7f8d250e92a5 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Fri, 16 Aug 2024 04:08:29 -0600 Subject: [PATCH 2/4] MueLu: Reshuffle TentativePFactory* to make diff easier Signed-off-by: Christian Glusa --- .../Smoothed-Aggregation/MueLu_LocalQR.hpp | 322 ++++ .../MueLu_TentativePFactory_decl.hpp | 15 +- .../MueLu_TentativePFactory_def.hpp | 1438 +++++++++-------- .../MueLu_TentativePFactory_kokkos_decl.hpp | 54 +- .../MueLu_TentativePFactory_kokkos_def.hpp | 419 +---- 5 files changed, 1125 insertions(+), 1123 deletions(-) create mode 100644 packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_LocalQR.hpp diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_LocalQR.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_LocalQR.hpp new file mode 100644 index 000000000000..733b16dc2c9c --- /dev/null +++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_LocalQR.hpp @@ -0,0 +1,322 @@ +#ifndef MUELU_LOCALQR_HPP +#define MUELU_LOCALQR_HPP + +#include +#include +#include "Xpetra_ConfigDefs.hpp" + +namespace MueLu::LocalQR { + +template +class ReduceMaxFunctor { + public: + ReduceMaxFunctor(View view) + : view_(view) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const LocalOrdinal& i, LocalOrdinal& vmax) const { + if (vmax < view_(i)) + vmax = view_(i); + } + + KOKKOS_INLINE_FUNCTION + void join(LocalOrdinal& dst, const LocalOrdinal& src) const { + if (dst < src) { + dst = src; + } + } + + KOKKOS_INLINE_FUNCTION + void init(LocalOrdinal& dst) const { + dst = 0; + } + + private: + View view_; +}; + +// local QR decomposition +template +class LocalQRDecompFunctor { + private: + using LO = LOType; + using GO = GOType; + using SC = SCType; + + using execution_space = typename DeviceType::execution_space; + using impl_SC = typename Kokkos::ArithTraits::val_type; + using impl_ATS = Kokkos::ArithTraits; + using Magnitude = typename impl_ATS::magnitudeType; + + using shared_matrix = Kokkos::View; + using shared_vector = Kokkos::View; + + NspType fineNS; + NspType coarseNS; + aggRowsType aggRows; + maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode) + agg2RowMapLOType agg2RowMapLO; + statusType statusAtomic; + rowsType rows; + rowsAuxType rowsAux; + colsAuxType colsAux; + valsAuxType valsAux; + bool doQRStep; + + public: + LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_) + : fineNS(fineNS_) + , coarseNS(coarseNS_) + , aggRows(aggRows_) + , maxAggDofSize(maxAggDofSize_) + , agg2RowMapLO(agg2RowMapLO_) + , statusAtomic(statusAtomic_) + , rows(rows_) + , rowsAux(rowsAux_) + , colsAux(colsAux_) + , valsAux(valsAux_) + , doQRStep(doQRStep_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type& thread, size_t& nnz) const { + auto agg = thread.league_rank(); + + // size of aggregate: number of DOFs in aggregate + auto aggSize = aggRows(agg + 1) - aggRows(agg); + + const impl_SC one = impl_ATS::one(); + const impl_SC two = one + one; + const impl_SC zero = impl_ATS::zero(); + const auto zeroM = impl_ATS::magnitude(zero); + + int m = aggSize; + int n = fineNS.extent(1); + + // calculate row offset for coarse nullspace + Xpetra::global_size_t offset = agg * n; + + if (doQRStep) { + // Extract the piece of the nullspace corresponding to the aggregate + shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end) + for (int j = 0; j < n; j++) + for (int k = 0; k < m; k++) + r(k, j) = fineNS(agg2RowMapLO(aggRows(agg) + k), j); +#if 0 + printf("A\n"); + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) + printf(" %5.3lf ", r(i,j)); + printf("\n"); + } +#endif + + // Calculate QR decomposition (standard) + shared_matrix q(thread.team_shmem(), m, m); // Q + if (m >= n) { + bool isSingular = false; + + // Initialize Q^T + auto qt = q; + for (int i = 0; i < m; i++) { + for (int j = 0; j < m; j++) + qt(i, j) = zero; + qt(i, i) = one; + } + + for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize + // FIXME_KOKKOS: use team + Magnitude s = zeroM; + Magnitude norm; + Magnitude norm_x; + for (int i = k + 1; i < m; i++) + s += pow(impl_ATS::magnitude(r(i, k)), 2); + norm = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s); + + if (norm == zero) { + isSingular = true; + break; + } + + r(k, k) -= norm * one; + + norm_x = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s); + if (norm_x == zeroM) { + // We have a single diagonal element in the column. + // No reflections required. Just need to restor r(k,k). + r(k, k) = norm * one; + continue; + } + + // FIXME_KOKKOS: use team + for (int i = k; i < m; i++) + r(i, k) /= norm_x; + + // Update R(k:m,k+1:n) + for (int j = k + 1; j < n; j++) { + // FIXME_KOKKOS: use team in the loops + impl_SC si = zero; + for (int i = k; i < m; i++) + si += r(i, k) * r(i, j); + for (int i = k; i < m; i++) + r(i, j) -= two * si * r(i, k); + } + + // Update Q^T (k:m,k:m) + for (int j = k; j < m; j++) { + // FIXME_KOKKOS: use team in the loops + impl_SC si = zero; + for (int i = k; i < m; i++) + si += r(i, k) * qt(i, j); + for (int i = k; i < m; i++) + qt(i, j) -= two * si * r(i, k); + } + + // Fix R(k:m,k) + r(k, k) = norm * one; + for (int i = k + 1; i < m; i++) + r(i, k) = zero; + } + +#if 0 + // Q = (Q^T)^T + for (int i = 0; i < m; i++) + for (int j = 0; j < i; j++) { + impl_SC tmp = qt(i,j); + qt(i,j) = qt(j,i); + qt(j,i) = tmp; + } +#endif + + // Build coarse nullspace using the upper triangular part of R + for (int j = 0; j < n; j++) + for (int k = 0; k <= j; k++) + coarseNS(offset + k, j) = r(k, j); + + if (isSingular) { + statusAtomic(1) = true; + return; + } + + } else { + // Special handling for m < n (i.e. single node aggregates in structural mechanics) + + // The local QR decomposition is not possible in the "overconstrained" + // case (i.e. number of columns in qr > number of rowsAux), which + // corresponds to #DOFs in Aggregate < n. For usual problems this + // is only possible for single node aggregates in structural mechanics. + // (Similar problems may arise in discontinuous Galerkin problems...) + // We bypass the QR decomposition and use an identity block in the + // tentative prolongator for the single node aggregate and transfer the + // corresponding fine level null space information 1-to-1 to the coarse + // level null space part. + + // NOTE: The resulting tentative prolongation operator has + // (m*DofsPerNode-n) zero columns leading to a singular + // coarse level operator A. To deal with that one has the following + // options: + // - Use the "RepairMainDiagonal" flag in the RAPFactory (default: + // false) to add some identity block to the diagonal of the zero rowsAux + // in the coarse level operator A, such that standard level smoothers + // can be used again. + // - Use special (projection-based) level smoothers, which can deal + // with singular matrices (very application specific) + // - Adapt the code below to avoid zero columns. However, we do not + // support a variable number of DOFs per node in MueLu/Xpetra which + // makes the implementation really hard. + // + // FIXME: do we need to check for singularity here somehow? Zero + // columns would be easy but linear dependency would require proper QR. + + // R = extended (by adding identity rowsAux) qr + for (int j = 0; j < n; j++) + for (int k = 0; k < n; k++) + if (k < m) + coarseNS(offset + k, j) = r(k, j); + else + coarseNS(offset + k, j) = (k == j ? one : zero); + + // Q = I (rectangular) + for (int i = 0; i < m; i++) + for (int j = 0; j < n; j++) + q(i, j) = (j == i ? one : zero); + } + + // Process each row in the local Q factor and fill helper arrays to assemble P + for (int j = 0; j < m; j++) { + LO localRow = agg2RowMapLO(aggRows(agg) + j); + size_t rowStart = rowsAux(localRow); + size_t lnnz = 0; + for (int k = 0; k < n; k++) { + // skip zeros + if (q(j, k) != zero) { + colsAux(rowStart + lnnz) = offset + k; + valsAux(rowStart + lnnz) = q(j, k); + lnnz++; + } + } + rows(localRow + 1) = lnnz; + nnz += lnnz; + } + +#if 0 + printf("R\n"); + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) + printf(" %5.3lf ", coarseNS(i,j)); + printf("\n"); + } + + printf("Q\n"); + for (int i = 0; i < aggSize; i++) { + for (int j = 0; j < aggSize; j++) + printf(" %5.3lf ", q(i,j)); + printf("\n"); + } +#endif + } else { + ///////////////////////////// + // "no-QR" option // + ///////////////////////////// + // Local Q factor is just the fine nullspace support over the current aggregate. + // Local R factor is the identity. + // TODO I have not implemented any special handling for aggregates that are too + // TODO small to locally support the nullspace, as is done in the standard QR + // TODO case above. + + for (int j = 0; j < m; j++) { + LO localRow = agg2RowMapLO(aggRows(agg) + j); + size_t rowStart = rowsAux(localRow); + size_t lnnz = 0; + for (int k = 0; k < n; k++) { + const impl_SC qr_jk = fineNS(localRow, k); + // skip zeros + if (qr_jk != zero) { + colsAux(rowStart + lnnz) = offset + k; + valsAux(rowStart + lnnz) = qr_jk; + lnnz++; + } + } + rows(localRow + 1) = lnnz; + nnz += lnnz; + } + + for (int j = 0; j < n; j++) + coarseNS(offset + j, j) = one; + } + } + + // amount of shared memory + size_t team_shmem_size(int /* team_size */) const { + if (doQRStep) { + int m = maxAggDofSize; + int n = fineNS.extent(1); + return shared_matrix::shmem_size(m, n) + // r + shared_matrix::shmem_size(m, m); // q + } else + return 0; + } +}; + +} // namespace MueLu::LocalQR + +#endif diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_decl.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_decl.hpp index 9c7df7f04811..ebf4bb49e192 100644 --- a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_decl.hpp +++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_decl.hpp @@ -26,12 +26,9 @@ #include #include "MueLu_ConfigDefs.hpp" -#include "MueLu_TentativePFactory_fwd.hpp" #include "MueLu_Aggregates_fwd.hpp" -#include "MueLu_AmalgamationFactory_fwd.hpp" #include "MueLu_AmalgamationInfo_fwd.hpp" -#include "MueLu_CoarseMapFactory_fwd.hpp" #include "MueLu_Level_fwd.hpp" #include "MueLu_PerfUtils_fwd.hpp" #include "MueLu_PFactory.hpp" @@ -91,26 +88,26 @@ class TentativePFactory : public PFactory { //@{ //! Constructor - TentativePFactory() {} + TentativePFactory(); //! Destructor. - virtual ~TentativePFactory() {} + ~TentativePFactory() override; //@} - RCP GetValidParameterList() const; + RCP GetValidParameterList() const override; //! Input //@{ - void DeclareInput(Level& fineLevel, Level& coarseLevel) const; + void DeclareInput(Level& fineLevel, Level& coarseLevel) const override; //@} //! @name Build methods. //@{ - void Build(Level& fineLevel, Level& coarseLevel) const; - void BuildP(Level& fineLevel, Level& coarseLevel) const; + void Build(Level& fineLevel, Level& coarseLevel) const override; + void BuildP(Level& fineLevel, Level& coarseLevel) const override; //@} diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_def.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_def.hpp index 220db70dc868..2a701af1014c 100644 --- a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_def.hpp +++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_def.hpp @@ -24,7 +24,6 @@ #include #include #include -#include #include "Xpetra_TpetraBlockCrsMatrix.hpp" @@ -39,6 +38,12 @@ namespace MueLu { +template +TentativePFactory::TentativePFactory() = default; + +template +TentativePFactory::~TentativePFactory() = default; + template RCP TentativePFactory::GetValidParameterList() const { RCP validParamList = rcp(new ParameterList()); @@ -50,14 +55,14 @@ RCP TentativePFactoryset("Nullspace name", "Nullspace", "Name for the input nullspace"); - validParamList->set >("A", Teuchos::null, "Generating factory of the matrix A"); - validParamList->set >("Aggregates", Teuchos::null, "Generating factory of the aggregates"); - validParamList->set >("Nullspace", Teuchos::null, "Generating factory of the nullspace"); - validParamList->set >("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace"); - validParamList->set >("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo"); - validParamList->set >("CoarseMap", Teuchos::null, "Generating factory of the coarse map"); - validParamList->set >("Coordinates", Teuchos::null, "Generating factory of the coordinates"); - validParamList->set >("Node Comm", Teuchos::null, "Generating factory of the node level communicator"); + validParamList->set>("A", Teuchos::null, "Generating factory of the matrix A"); + validParamList->set>("Aggregates", Teuchos::null, "Generating factory of the aggregates"); + validParamList->set>("Nullspace", Teuchos::null, "Generating factory of the nullspace"); + validParamList->set>("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace"); + validParamList->set>("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo"); + validParamList->set>("CoarseMap", Teuchos::null, "Generating factory of the coarse map"); + validParamList->set>("Coordinates", Teuchos::null, "Generating factory of the coordinates"); + validParamList->set>("Node Comm", Teuchos::null, "Generating factory of the node level communicator"); // Make sure we don't recursively validate options for the matrixmatrix kernels ParameterList norecurse; @@ -97,6 +102,7 @@ void TentativePFactory::Build(Level& template void TentativePFactory::BuildP(Level& fineLevel, Level& coarseLevel) const { FactoryMonitor m(*this, "Build", coarseLevel); + typedef typename Teuchos::ScalarTraits::coordinateType coordinate_type; typedef Xpetra::MultiVector RealValuedMultiVector; typedef Xpetra::MultiVectorFactory RealValuedMultiVectorFactory; @@ -106,8 +112,8 @@ void TentativePFactory::BuildP(Level& if (pL.isParameter("Nullspace name")) nspName = pL.get("Nullspace name"); RCP Ptentative; - RCP A = Get >(fineLevel, "A"); - RCP aggregates = Get >(fineLevel, "Aggregates"); + auto A = Get>(fineLevel, "A"); + auto aggregates = Get>(fineLevel, "Aggregates"); // No coarse DoFs so we need to bail by setting Ptentattive to null and returning // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize() if (aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) { @@ -116,18 +122,18 @@ void TentativePFactory::BuildP(Level& return; } - RCP amalgInfo = Get >(fineLevel, "UnAmalgamationInfo"); - RCP fineNullspace = Get >(fineLevel, nspName); - RCP coarseMap = Get >(fineLevel, "CoarseMap"); + auto amalgInfo = Get>(fineLevel, "UnAmalgamationInfo"); + auto fineNullspace = Get>(fineLevel, nspName); + auto coarseMap = Get>(fineLevel, "CoarseMap"); RCP fineCoords; if (bTransferCoordinates_) { - fineCoords = Get >(fineLevel, "Coordinates"); + fineCoords = Get>(fineLevel, "Coordinates"); } // FIXME: We should remove the NodeComm on levels past the threshold if (fineLevel.IsAvailable("Node Comm")) { - RCP > nodeComm = Get > >(fineLevel, "Node Comm"); - Set > >(coarseLevel, "Node Comm", nodeComm); + RCP> nodeComm = Get>>(fineLevel, "Node Comm"); + Set>>(coarseLevel, "Node Comm", nodeComm); } // NOTE: We check DomainMap here rather than RowMap because those are different for BlockCrs matrices @@ -235,25 +241,14 @@ void TentativePFactory::BuildP(Level& template void TentativePFactory:: - BuildPuncoupledBlockCrs(RCP A, RCP aggregates, RCP amalgInfo, RCP fineNullspace, - RCP coarsePointMap, RCP& Ptentative, RCP& coarseNullspace, const int levelID) const { - /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could - be generalized later, if we ever need to do so: - 1) Null space dimension === block size of matrix: So no elasticity right now - 2) QR is not supported: Under assumption #1, this shouldn't cause problems. - 3) Maps are "good": Aka the first chunk of the ColMap is the RowMap. - - These assumptions keep our code way simpler and still support the use cases we actually care about. - */ - - RCP rowMap = A->getRowMap(); - RCP rangeMap = A->getRangeMap(); - RCP colMap = A->getColMap(); - // const size_t numFinePointRows = rangeMap->getLocalNumElements(); - const size_t numFineBlockRows = rowMap->getLocalNumElements(); + BuildPuncoupled(RCP A, RCP aggregates, RCP amalgInfo, RCP fineNullspace, + RCP coarseMap, RCP& Ptentative, RCP& coarseNullspace, const int levelID) const { + RCP rowMap = A->getRowMap(); + RCP colMap = A->getColMap(); + const size_t numRows = rowMap->getLocalNumElements(); typedef Teuchos::ScalarTraits STS; - // typedef typename STS::magnitudeType Magnitude; + typedef typename STS::magnitudeType Magnitude; const SC zero = STS::zero(); const SC one = STS::one(); const LO INVALID = Teuchos::OrdinalTraits::invalid(); @@ -262,16 +257,6 @@ void TentativePFactory:: const size_t NSDim = fineNullspace->getNumVectors(); ArrayRCP aggSizes = aggregates->ComputeAggregateSizesArrayRCP(); - // Need to generate the coarse block map - // NOTE: We assume NSDim == block size here - // NOTE: We also assume that coarseMap has contiguous GIDs - // const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements(); - const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim; - RCP coarseBlockMap = MapFactory::Build(coarsePointMap->lib(), - Teuchos::OrdinalTraits::invalid(), - numCoarseBlockRows, - coarsePointMap->getIndexBase(), - coarsePointMap->getComm()); // Sanity checking const ParameterList& pL = GetParameterList(); const bool& doQRStep = pL.get("tentative: calculate qr"); @@ -280,8 +265,6 @@ void TentativePFactory:: TEUCHOS_TEST_FOR_EXCEPTION(doQRStep && constantColSums, Exceptions::RuntimeError, "MueLu::TentativePFactory::MakeTentative: cannot use 'constant column sums' and 'calculate qr' at the same time"); - // The aggregates use the amalgamated column map, which in this case is what we want - // Aggregates map is based on the amalgamated column map // We can skip global-to-local conversion if LIDs in row map are // same as LIDs in column map @@ -297,508 +280,309 @@ void TentativePFactory:: if (goodMap) { amalgInfo->UnamalgamateAggregatesLO(*aggregates, aggStart, aggToRowMapLO); GetOStream(Runtime1) << "Column map is consistent with the row map, good." << std::endl; + } else { - throw std::runtime_error("TentativePFactory::PuncoupledBlockCrs: Inconsistent maps not currently supported"); + amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMapGO); + GetOStream(Warnings0) << "Column map is not consistent with the row map\n" + << "using GO->LO conversion with performance penalty" << std::endl; } - - coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim); + coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim); // Pull out the nullspace vectors so that we can have random access. - ArrayRCP > fineNS(NSDim); - ArrayRCP > coarseNS(NSDim); + ArrayRCP> fineNS(NSDim); + ArrayRCP> coarseNS(NSDim); for (size_t i = 0; i < NSDim; i++) { fineNS[i] = fineNullspace->getData(i); - if (coarsePointMap->getLocalNumElements() > 0) + if (coarseMap->getLocalNumElements() > 0) coarseNS[i] = coarseNullspace->getDataNonConst(i); } - // BlockCrs requires that we build the (block) graph first, so let's do that... - // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one - // block non-zero per row in the matrix; - RCP BlockGraph = CrsGraphFactory::Build(rowMap, coarseBlockMap, 0); + size_t nnzEstimate = numRows * NSDim; + + // Time to construct the matrix and fill in the values + Ptentative = rcp(new CrsMatrixWrap(rowMap, coarseMap, 0)); + RCP PtentCrs = rcp_dynamic_cast(Ptentative)->getCrsMatrix(); + ArrayRCP iaPtent; ArrayRCP jaPtent; - BlockGraph->allocateAllIndices(numFineBlockRows, iaPtent, jaPtent); + ArrayRCP valPtent; + + PtentCrs->allocateAllValues(nnzEstimate, iaPtent, jaPtent, valPtent); + ArrayView ia = iaPtent(); ArrayView ja = jaPtent(); + ArrayView val = valPtent(); - for (size_t i = 0; i < numFineBlockRows; i++) { - ia[i] = i; - ja[i] = INVALID; + ia[0] = 0; + for (size_t i = 1; i <= numRows; i++) + ia[i] = ia[i - 1] + NSDim; + + for (size_t j = 0; j < nnzEstimate; j++) { + ja[j] = INVALID; + val[j] = zero; } - ia[numCoarseBlockRows] = numCoarseBlockRows; - for (GO agg = 0; agg < numAggs; agg++) { - LO aggSize = aggStart[agg + 1] - aggStart[agg]; - Xpetra::global_size_t offset = agg; + if (doQRStep) { + //////////////////////////////// + // Standard aggregate-wise QR // + //////////////////////////////// + for (GO agg = 0; agg < numAggs; agg++) { + LO aggSize = aggStart[agg + 1] - aggStart[agg]; - for (LO j = 0; j < aggSize; j++) { - // FIXME: Allow for bad maps - const LO localRow = aggToRowMapLO[aggStart[agg] + j]; - const size_t rowStart = ia[localRow]; - ja[rowStart] = offset; - } - } + Xpetra::global_size_t offset = agg * NSDim; - // Compress storage (remove all INVALID, which happen when we skip zeros) - // We do that in-place - size_t ia_tmp = 0, nnz = 0; - for (size_t i = 0; i < numFineBlockRows; i++) { - for (size_t j = ia_tmp; j < ia[i + 1]; j++) - if (ja[j] != INVALID) { - ja[nnz] = ja[j]; - nnz++; + // Extract the piece of the nullspace corresponding to the aggregate, and + // put it in the flat array, "localQR" (in column major format) for the + // QR routine. + Teuchos::SerialDenseMatrix localQR(aggSize, NSDim); + if (goodMap) { + for (size_t j = 0; j < NSDim; j++) + for (LO k = 0; k < aggSize; k++) + localQR(k, j) = fineNS[j][aggToRowMapLO[aggStart[agg] + k]]; + } else { + for (size_t j = 0; j < NSDim; j++) + for (LO k = 0; k < aggSize; k++) + localQR(k, j) = fineNS[j][rowMap->getLocalElement(aggToRowMapGO[aggStart[agg] + k])]; } - ia_tmp = ia[i + 1]; - ia[i + 1] = nnz; - } - - if (rowMap->lib() == Xpetra::UseTpetra) { - // - Cannot resize for Epetra, as it checks for same pointers - // - Need to resize for Tpetra, as it check ().size() == ia[numRows] - // NOTE: these invalidate ja and val views - jaPtent.resize(nnz); - } - GetOStream(Runtime1) << "TentativePFactory : generating block graph" << std::endl; - BlockGraph->setAllIndices(iaPtent, jaPtent); + // Test for zero columns + for (size_t j = 0; j < NSDim; j++) { + bool bIsZeroNSColumn = true; - // Managing labels & constants for ESFC - { - RCP FCparams; - if (pL.isSublist("matrixmatrix: kernel params")) - FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params"))); - else - FCparams = rcp(new ParameterList); - // By default, we don't need global constants for TentativeP, but we do want it for the graph - // if we're printing statistics, so let's leave it on for now. - FCparams->set("compute global constants", FCparams->get("compute global constants", true)); - std::string levelIDs = toString(levelID); - FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + levelIDs); - RCP dummy_e; - RCP dummy_i; - BlockGraph->expertStaticFillComplete(coarseBlockMap, rowMap, dummy_i, dummy_e, FCparams); - } + for (LO k = 0; k < aggSize; k++) + if (localQR(k, j) != zero) + bIsZeroNSColumn = false; - // Now let's make a BlockCrs Matrix - // NOTE: Assumes block size== NSDim - RCP > P_xpetra = Xpetra::CrsMatrixFactory::BuildBlock(BlockGraph, coarsePointMap, rangeMap, NSDim); - RCP > P_tpetra = rcp_dynamic_cast >(P_xpetra); - if (P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix"); - RCP P_wrap = rcp(new CrsMatrixWrap(P_xpetra)); + TEUCHOS_TEST_FOR_EXCEPTION(bIsZeroNSColumn == true, Exceptions::RuntimeError, + "MueLu::TentativePFactory::MakeTentative: fine level NS part has a zero column in NS column " << j); + } - ///////////////////////////// - // "no-QR" option // - ///////////////////////////// - // Local Q factor is just the fine nullspace support over the current aggregate. - // Local R factor is the identity. - // NOTE: We're not going to do a QR here as we're assuming that blocksize == NSDim - // NOTE: "goodMap" case only - Teuchos::Array block(NSDim * NSDim, zero); - Teuchos::Array bcol(1); + // Calculate QR decomposition (standard) + // NOTE: Q is stored in localQR and R is stored in coarseNS + if (aggSize >= Teuchos::as(NSDim)) { + if (NSDim == 1) { + // Only one nullspace vector, calculate Q and R by hand + Magnitude norm = STS::magnitude(zero); + for (size_t k = 0; k < Teuchos::as(aggSize); k++) + norm += STS::magnitude(localQR(k, 0) * localQR(k, 0)); + norm = Teuchos::ScalarTraits::squareroot(norm); - GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl; - for (LO agg = 0; agg < numAggs; agg++) { - bcol[0] = agg; - const LO aggSize = aggStart[agg + 1] - aggStart[agg]; - Xpetra::global_size_t offset = agg * NSDim; + // R = norm + coarseNS[0][offset] = norm; - // Process each row in the local Q factor - // NOTE: Blocks are in row-major order - for (LO j = 0; j < aggSize; j++) { - const LO localBlockRow = aggToRowMapLO[aggStart[agg] + j]; + // Q = localQR(:,0)/norm + for (LO i = 0; i < aggSize; i++) + localQR(i, 0) /= norm; - for (size_t r = 0; r < NSDim; r++) { - LO localPointRow = localBlockRow * NSDim + r; - for (size_t c = 0; c < NSDim; c++) - block[r * NSDim + c] = fineNS[c][localPointRow]; - } - // NOTE: Assumes columns==aggs and are ordered sequentially - P_tpetra->replaceLocalValues(localBlockRow, bcol(), block()); + } else { + Teuchos::SerialQRDenseSolver qrSolver; + qrSolver.setMatrix(Teuchos::rcp(&localQR, false)); + qrSolver.factor(); - } // end aggSize + // R = upper triangular part of localQR + for (size_t j = 0; j < NSDim; j++) + for (size_t k = 0; k <= j; k++) + coarseNS[j][offset + k] = localQR(k, j); // TODO is offset+k the correct local ID?! - for (size_t j = 0; j < NSDim; j++) - coarseNS[j][offset + j] = one; + // Calculate Q, the tentative prolongator. + // The Lapack GEQRF call only works for myAggsize >= NSDim + qrSolver.formQ(); + Teuchos::RCP> qFactor = qrSolver.getQ(); + for (size_t j = 0; j < NSDim; j++) + for (size_t i = 0; i < Teuchos::as(aggSize); i++) + localQR(i, j) = (*qFactor)(i, j); + } - } // for (GO agg = 0; agg < numAggs; agg++) + } else { + // Special handling for aggSize < NSDim (i.e. single node aggregates in structural mechanics) - Ptentative = P_wrap; -} + // The local QR decomposition is not possible in the "overconstrained" + // case (i.e. number of columns in localQR > number of rows), which + // corresponds to #DOFs in Aggregate < NSDim. For usual problems this + // is only possible for single node aggregates in structural mechanics. + // (Similar problems may arise in discontinuous Galerkin problems...) + // We bypass the QR decomposition and use an identity block in the + // tentative prolongator for the single node aggregate and transfer the + // corresponding fine level null space information 1-to-1 to the coarse + // level null space part. -template -void TentativePFactory:: - BuildPcoupled(RCP A, RCP aggregates, RCP amalgInfo, RCP fineNullspace, - RCP coarseMap, RCP& Ptentative, RCP& coarseNullspace) const { - typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType Magnitude; - const SC zero = STS::zero(); - const SC one = STS::one(); + // NOTE: The resulting tentative prolongation operator has + // (aggSize*DofsPerNode-NSDim) zero columns leading to a singular + // coarse level operator A. To deal with that one has the following + // options: + // - Use the "RepairMainDiagonal" flag in the RAPFactory (default: + // false) to add some identity block to the diagonal of the zero rows + // in the coarse level operator A, such that standard level smoothers + // can be used again. + // - Use special (projection-based) level smoothers, which can deal + // with singular matrices (very application specific) + // - Adapt the code below to avoid zero columns. However, we do not + // support a variable number of DOFs per node in MueLu/Xpetra which + // makes the implementation really hard. - // number of aggregates - GO numAggs = aggregates->GetNumAggregates(); + // R = extended (by adding identity rows) localQR + for (size_t j = 0; j < NSDim; j++) + for (size_t k = 0; k < NSDim; k++) + if (k < as(aggSize)) + coarseNS[j][offset + k] = localQR(k, j); + else + coarseNS[j][offset + k] = (k == j ? one : zero); - // Create a lookup table to determine the rows (fine DOFs) that belong to a given aggregate. - // aggStart is a pointer into aggToRowMap - // aggStart[i]..aggStart[i+1] are indices into aggToRowMap - // aggToRowMap[aggStart[i]]..aggToRowMap[aggStart[i+1]-1] are the DOFs in aggregate i - ArrayRCP aggStart; - ArrayRCP aggToRowMap; - amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMap); - - // find size of the largest aggregate - LO maxAggSize = 0; - for (GO i = 0; i < numAggs; ++i) { - LO sizeOfThisAgg = aggStart[i + 1] - aggStart[i]; - if (sizeOfThisAgg > maxAggSize) maxAggSize = sizeOfThisAgg; - } - - // dimension of fine level nullspace - const size_t NSDim = fineNullspace->getNumVectors(); - - // index base for coarse Dof map (usually 0) - GO indexBase = A->getRowMap()->getIndexBase(); - - const RCP nonUniqueMap = amalgInfo->ComputeUnamalgamatedImportDofMap(*aggregates); - const RCP uniqueMap = A->getDomainMap(); - RCP importer = ImportFactory::Build(uniqueMap, nonUniqueMap); - RCP fineNullspaceWithOverlap = MultiVectorFactory::Build(nonUniqueMap, NSDim); - fineNullspaceWithOverlap->doImport(*fineNullspace, *importer, Xpetra::INSERT); - - // Pull out the nullspace vectors so that we can have random access. - ArrayRCP > fineNS(NSDim); - for (size_t i = 0; i < NSDim; ++i) - fineNS[i] = fineNullspaceWithOverlap->getData(i); - - // Allocate storage for the coarse nullspace. - coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim); - - ArrayRCP > coarseNS(NSDim); - for (size_t i = 0; i < NSDim; ++i) - if (coarseMap->getLocalNumElements() > 0) coarseNS[i] = coarseNullspace->getDataNonConst(i); - - // This makes the rowmap of Ptent the same as that of A-> - // This requires moving some parts of some local Q's to other processors - // because aggregates can span processors. - RCP rowMapForPtent = A->getRowMap(); - const Map& rowMapForPtentRef = *rowMapForPtent; - - // Set up storage for the rows of the local Qs that belong to other processors. - // FIXME This is inefficient and could be done within the main loop below with std::vector's. - RCP colMap = A->getColMap(); - - RCP ghostQMap; - RCP ghostQvalues; - Array > > ghostQcolumns; - RCP > ghostQrowNums; - ArrayRCP > ghostQvals; - ArrayRCP > ghostQcols; - ArrayRCP ghostQrows; - - Array ghostGIDs; - for (LO j = 0; j < numAggs; ++j) { - for (LO k = aggStart[j]; k < aggStart[j + 1]; ++k) { - if (rowMapForPtentRef.isNodeGlobalElement(aggToRowMap[k]) == false) { - ghostGIDs.push_back(aggToRowMap[k]); + // Q = I (rectangular) + for (size_t i = 0; i < as(aggSize); i++) + for (size_t j = 0; j < NSDim; j++) + localQR(i, j) = (j == i ? one : zero); } - } - } - ghostQMap = MapFactory::Build(A->getRowMap()->lib(), - Teuchos::OrdinalTraits::invalid(), - ghostGIDs, - indexBase, A->getRowMap()->getComm()); // JG:Xpetra::global_size_t>? - // Vector to hold bits of Q that go to other processors. - ghostQvalues = MultiVectorFactory::Build(ghostQMap, NSDim); - // Note that Epetra does not support MultiVectors templated on Scalar != double. - // So to work around this, we allocate an array of Vectors. This shouldn't be too - // expensive, as the number of Vectors is NSDim. - ghostQcolumns.resize(NSDim); - for (size_t i = 0; i < NSDim; ++i) - ghostQcolumns[i] = Xpetra::VectorFactory::Build(ghostQMap); - ghostQrowNums = Xpetra::VectorFactory::Build(ghostQMap); - if (ghostQvalues->getLocalLength() > 0) { - ghostQvals.resize(NSDim); - ghostQcols.resize(NSDim); - for (size_t i = 0; i < NSDim; ++i) { - ghostQvals[i] = ghostQvalues->getDataNonConst(i); - ghostQcols[i] = ghostQcolumns[i]->getDataNonConst(0); - } - ghostQrows = ghostQrowNums->getDataNonConst(0); - } - - // importer to handle moving Q - importer = ImportFactory::Build(ghostQMap, A->getRowMap()); - - // Dense QR solver - Teuchos::SerialQRDenseSolver qrSolver; - - // Allocate temporary storage for the tentative prolongator. - Array globalColPtr(maxAggSize * NSDim, 0); - Array localColPtr(maxAggSize * NSDim, 0); - Array valPtr(maxAggSize * NSDim, 0.); - - // Create column map for Ptent, estimate local #nonzeros in Ptent, and create Ptent itself. - const Map& coarseMapRef = *coarseMap; - - // For the 3-arrays constructor - ArrayRCP ptent_rowptr; - ArrayRCP ptent_colind; - ArrayRCP ptent_values; - - // Because ArrayRCPs are slow... - ArrayView rowptr_v; - ArrayView colind_v; - ArrayView values_v; - - // For temporary usage - Array rowptr_temp; - Array colind_temp; - Array values_temp; - - RCP PtentCrs; - - RCP PtentCrsWrap = rcp(new CrsMatrixWrap(rowMapForPtent, NSDim)); - PtentCrs = PtentCrsWrap->getCrsMatrix(); - Ptentative = PtentCrsWrap; - - //***************************************************************** - // Loop over all aggregates and calculate local QR decompositions. - //***************************************************************** - GO qctr = 0; // for indexing into Ptent data vectors - const Map& nonUniqueMapRef = *nonUniqueMap; - - size_t total_nnz_count = 0; - - for (GO agg = 0; agg < numAggs; ++agg) { - LO myAggSize = aggStart[agg + 1] - aggStart[agg]; - // For each aggregate, extract the corresponding piece of the nullspace and put it in the flat array, - // "localQR" (in column major format) for the QR routine. - Teuchos::SerialDenseMatrix localQR(myAggSize, NSDim); - for (size_t j = 0; j < NSDim; ++j) { - bool bIsZeroNSColumn = true; - for (LO k = 0; k < myAggSize; ++k) { - // aggToRowMap[aggPtr[i]+k] is the kth DOF in the ith aggregate - // fineNS[j][n] is the nth entry in the jth NS vector - try { - SC nsVal = fineNS[j][nonUniqueMapRef.getLocalElement(aggToRowMap[aggStart[agg] + k])]; // extract information from fine level NS - localQR(k, j) = nsVal; - if (nsVal != zero) bIsZeroNSColumn = false; - } catch (...) { - GetOStream(Runtime1, -1) << "length of fine level nsp: " << fineNullspace->getGlobalLength() << std::endl; - GetOStream(Runtime1, -1) << "length of fine level nsp w overlap: " << fineNullspaceWithOverlap->getGlobalLength() << std::endl; - GetOStream(Runtime1, -1) << "(local?) aggId=" << agg << std::endl; - GetOStream(Runtime1, -1) << "aggSize=" << myAggSize << std::endl; - GetOStream(Runtime1, -1) << "agg DOF=" << k << std::endl; - GetOStream(Runtime1, -1) << "NS vector j=" << j << std::endl; - GetOStream(Runtime1, -1) << "j*myAggSize + k = " << j * myAggSize + k << std::endl; - GetOStream(Runtime1, -1) << "aggToRowMap[" << agg << "][" << k << "] = " << aggToRowMap[aggStart[agg] + k] << std::endl; - GetOStream(Runtime1, -1) << "id aggToRowMap[agg][k]=" << aggToRowMap[aggStart[agg] + k] << " is global element in nonUniqueMap = " << nonUniqueMapRef.isNodeGlobalElement(aggToRowMap[aggStart[agg] + k]) << std::endl; - GetOStream(Runtime1, -1) << "colMap local id aggToRowMap[agg][k]=" << nonUniqueMapRef.getLocalElement(aggToRowMap[aggStart[agg] + k]) << std::endl; - GetOStream(Runtime1, -1) << "fineNS...=" << fineNS[j][nonUniqueMapRef.getLocalElement(aggToRowMap[aggStart[agg] + k])] << std::endl; - GetOStream(Errors, -1) << "caught an error!" << std::endl; - } - } // for (LO k=0 ... - TEUCHOS_TEST_FOR_EXCEPTION(bIsZeroNSColumn == true, Exceptions::RuntimeError, "MueLu::TentativePFactory::MakeTentative: fine level NS part has a zero column. Error."); - } // for (LO j=0 ... - Xpetra::global_size_t offset = agg * NSDim; - - if (myAggSize >= Teuchos::as(NSDim)) { - // calculate QR decomposition (standard) - // R is stored in localQR (size: myAggSize x NSDim) - - // Householder multiplier - SC tau = localQR(0, 0); - - if (NSDim == 1) { - // Only one nullspace vector, so normalize by hand - Magnitude dtemp = 0; - for (size_t k = 0; k < Teuchos::as(myAggSize); ++k) { - Magnitude tmag = STS::magnitude(localQR(k, 0)); - dtemp += tmag * tmag; - } - dtemp = Teuchos::ScalarTraits::squareroot(dtemp); - tau = localQR(0, 0); - localQR(0, 0) = dtemp; - } else { - qrSolver.setMatrix(Teuchos::rcp(&localQR, false)); - qrSolver.factor(); - } + // Process each row in the local Q factor + // FIXME: What happens if maps are blocked? + for (LO j = 0; j < aggSize; j++) { + LO localRow = (goodMap ? aggToRowMapLO[aggStart[agg] + j] : rowMap->getLocalElement(aggToRowMapGO[aggStart[agg] + j])); - // Extract R, the coarse nullspace. This is stored in upper triangular part of localQR. - // Note: coarseNS[i][.] is the ith coarse nullspace vector, which may be counter to your intuition. - // This stores the (offset+k)th entry only if it is local according to the coarseMap. - for (size_t j = 0; j < NSDim; ++j) { - for (size_t k = 0; k <= j; ++k) { - try { - if (coarseMapRef.isNodeLocalElement(offset + k)) { - coarseNS[j][offset + k] = localQR(k, j); // TODO is offset+k the correct local ID?! - } - } catch (...) { - GetOStream(Errors, -1) << "caught error in coarseNS insert, j=" << j << ", offset+k = " << offset + k << std::endl; + size_t rowStart = ia[localRow]; + for (size_t k = 0, lnnz = 0; k < NSDim; k++) { + // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions) + if (localQR(j, k) != zero) { + ja[rowStart + lnnz] = offset + k; + val[rowStart + lnnz] = localQR(j, k); + lnnz++; } } } + } - // Calculate Q, the tentative prolongator. - // The Lapack GEQRF call only works for myAggsize >= NSDim + } else { + GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl; + if (NSDim > 1) + GetOStream(Warnings0) << "TentativePFactory : for nontrivial nullspace, this may degrade performance" << std::endl; + ///////////////////////////// + // "no-QR" option // + ///////////////////////////// + // Local Q factor is just the fine nullspace support over the current aggregate. + // Local R factor is the identity. + // TODO I have not implemented any special handling for aggregates that are too + // TODO small to locally support the nullspace, as is done in the standard QR + // TODO case above. + if (goodMap) { + for (GO agg = 0; agg < numAggs; agg++) { + const LO aggSize = aggStart[agg + 1] - aggStart[agg]; + Xpetra::global_size_t offset = agg * NSDim; - if (NSDim == 1) { - // Only one nullspace vector, so calculate Q by hand - Magnitude dtemp = Teuchos::ScalarTraits::magnitude(localQR(0, 0)); - localQR(0, 0) = tau; - dtemp = 1 / dtemp; - for (LocalOrdinal i = 0; i < myAggSize; ++i) { - localQR(i, 0) *= dtemp; - } - } else { - qrSolver.formQ(); - Teuchos::RCP > qFactor = qrSolver.getQ(); - for (size_t j = 0; j < NSDim; j++) { - for (size_t i = 0; i < Teuchos::as(myAggSize); i++) { - localQR(i, j) = (*qFactor)(i, j); - } - } - } + // Process each row in the local Q factor + // FIXME: What happens if maps are blocked? + for (LO j = 0; j < aggSize; j++) { + // TODO Here I do not check for a zero nullspace column on the aggregate. + // as is done in the standard QR case. - // end default case (myAggSize >= NSDim) - } else { // special handling for myAggSize < NSDim (i.e. 1pt nodes) - // See comments for the uncoupled case + const LO localRow = aggToRowMapLO[aggStart[agg] + j]; - // R = extended (by adding identity rows) localQR - for (size_t j = 0; j < NSDim; j++) - for (size_t k = 0; k < NSDim; k++) { - TEUCHOS_TEST_FOR_EXCEPTION(!coarseMapRef.isNodeLocalElement(offset + k), Exceptions::RuntimeError, - "Caught error in coarseNS insert, j=" << j << ", offset+k = " << offset + k); + const size_t rowStart = ia[localRow]; - if (k < as(myAggSize)) - coarseNS[j][offset + k] = localQR(k, j); - else - coarseNS[j][offset + k] = (k == j ? one : zero); + for (size_t k = 0, lnnz = 0; k < NSDim; k++) { + // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions) + SC qr_jk = fineNS[k][aggToRowMapLO[aggStart[agg] + j]]; + if (constantColSums) qr_jk = qr_jk / (Magnitude)aggSizes[agg]; + if (qr_jk != zero) { + ja[rowStart + lnnz] = offset + k; + val[rowStart + lnnz] = qr_jk; + lnnz++; + } + } } - - // Q = I (rectangular) - for (size_t i = 0; i < as(myAggSize); i++) for (size_t j = 0; j < NSDim; j++) - localQR(i, j) = (j == i ? one : zero); - } // end else (special handling for 1pt aggregates) + coarseNS[j][offset + j] = one; + } // for (GO agg = 0; agg < numAggs; agg++) - // Process each row in the local Q factor. If the row is local to the current processor - // according to the rowmap, insert it into Ptentative. Otherwise, save it in ghostQ - // to be communicated later to the owning processor. - // FIXME -- what happens if maps are blocked? - for (GO j = 0; j < myAggSize; ++j) { - // This loop checks whether row associated with current DOF is local, according to rowMapForPtent. - // If it is, the row is inserted. If not, the row number, columns, and values are saved in - // MultiVectors that will be sent to other processors. - GO globalRow = aggToRowMap[aggStart[agg] + j]; + } else { + for (GO agg = 0; agg < numAggs; agg++) { + const LO aggSize = aggStart[agg + 1] - aggStart[agg]; + Xpetra::global_size_t offset = agg * NSDim; + for (LO j = 0; j < aggSize; j++) { + const LO localRow = rowMap->getLocalElement(aggToRowMapGO[aggStart[agg] + j]); - // TODO is the use of Xpetra::global_size_t below correct? - if (rowMapForPtentRef.isNodeGlobalElement(globalRow) == false) { - ghostQrows[qctr] = globalRow; - for (size_t k = 0; k < NSDim; ++k) { - ghostQcols[k][qctr] = coarseMapRef.getGlobalElement(agg * NSDim + k); - ghostQvals[k][qctr] = localQR(j, k); - } - ++qctr; - } else { - size_t nnz = 0; - for (size_t k = 0; k < NSDim; ++k) { - try { - if (localQR(j, k) != Teuchos::ScalarTraits::zero()) { - localColPtr[nnz] = agg * NSDim + k; - globalColPtr[nnz] = coarseMapRef.getGlobalElement(localColPtr[nnz]); - valPtr[nnz] = localQR(j, k); - ++total_nnz_count; - ++nnz; + const size_t rowStart = ia[localRow]; + + for (size_t k = 0, lnnz = 0; k < NSDim; ++k) { + // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions) + SC qr_jk = fineNS[k][rowMap->getLocalElement(aggToRowMapGO[aggStart[agg] + j])]; + if (constantColSums) qr_jk = qr_jk / (Magnitude)aggSizes[agg]; + if (qr_jk != zero) { + ja[rowStart + lnnz] = offset + k; + val[rowStart + lnnz] = qr_jk; + lnnz++; } - } catch (...) { - GetOStream(Errors, -1) << "caught error in colPtr/valPtr insert, current index=" << nnz << std::endl; } - } // for (size_t k=0; kinsertGlobalValues(globalRow, globalColPtr.view(0, nnz), valPtr.view(0, nnz)); - } catch (...) { - GetOStream(Errors, -1) << "pid " << A->getRowMap()->getComm()->getRank() - << "caught error during Ptent row insertion, global row " - << globalRow << std::endl; } - } - } // for (GO j=0; j > targetQrowNums = Xpetra::VectorFactory::Build(rowMapForPtent); - targetQrowNums->putScalar(-1); - targetQrowNums->doImport(*ghostQrowNums, *importer, Xpetra::INSERT); - ArrayRCP targetQrows = targetQrowNums->getDataNonConst(0); + for (size_t j = 0; j < NSDim; j++) + coarseNS[j][offset + j] = one; + } // for (GO agg = 0; agg < numAggs; agg++) - // Now create map based on just the row numbers imported. - Array gidsToImport; - gidsToImport.reserve(targetQrows.size()); - for (typename ArrayRCP::iterator r = targetQrows.begin(); r != targetQrows.end(); ++r) { - if (*r > -1) { - gidsToImport.push_back(*r); - } - } - RCP reducedMap = MapFactory::Build(A->getRowMap()->lib(), - Teuchos::OrdinalTraits::invalid(), - gidsToImport, indexBase, A->getRowMap()->getComm()); + } // if (goodmap) else ... - // Import using the row numbers that this processor will receive. - importer = ImportFactory::Build(ghostQMap, reducedMap); + } // if doQRStep ... else - Array > > targetQcolumns(NSDim); - for (size_t i = 0; i < NSDim; ++i) { - targetQcolumns[i] = Xpetra::VectorFactory::Build(reducedMap); - targetQcolumns[i]->doImport(*(ghostQcolumns[i]), *importer, Xpetra::INSERT); + // Compress storage (remove all INVALID, which happen when we skip zeros) + // We do that in-place + size_t ia_tmp = 0, nnz = 0; + for (size_t i = 0; i < numRows; i++) { + for (size_t j = ia_tmp; j < ia[i + 1]; j++) + if (ja[j] != INVALID) { + ja[nnz] = ja[j]; + val[nnz] = val[j]; + nnz++; + } + ia_tmp = ia[i + 1]; + ia[i + 1] = nnz; } - RCP targetQvalues = MultiVectorFactory::Build(reducedMap, NSDim); - targetQvalues->doImport(*ghostQvalues, *importer, Xpetra::INSERT); - - ArrayRCP > targetQvals; - ArrayRCP > targetQcols; - if (targetQvalues->getLocalLength() > 0) { - targetQvals.resize(NSDim); - targetQcols.resize(NSDim); - for (size_t i = 0; i < NSDim; ++i) { - targetQvals[i] = targetQvalues->getDataNonConst(i); - targetQcols[i] = targetQcolumns[i]->getDataNonConst(0); - } + if (rowMap->lib() == Xpetra::UseTpetra) { + // - Cannot resize for Epetra, as it checks for same pointers + // - Need to resize for Tpetra, as it check ().size() == ia[numRows] + // NOTE: these invalidate ja and val views + jaPtent.resize(nnz); + valPtent.resize(nnz); } - valPtr = Array(NSDim, 0.); - globalColPtr = Array(NSDim, 0); - for (typename Array::iterator r = gidsToImport.begin(); r != gidsToImport.end(); ++r) { - if (targetQvalues->getLocalLength() > 0) { - for (size_t j = 0; j < NSDim; ++j) { - valPtr[j] = targetQvals[j][reducedMap->getLocalElement(*r)]; - globalColPtr[j] = targetQcols[j][reducedMap->getLocalElement(*r)]; - } - Ptentative->insertGlobalValues(*r, globalColPtr.view(0, NSDim), valPtr.view(0, NSDim)); - } // if (targetQvalues->getLocalLength() > 0) - } + GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl; - Ptentative->fillComplete(coarseMap, A->getDomainMap()); + PtentCrs->setAllValues(iaPtent, jaPtent, valPtent); + + // Managing labels & constants for ESFC + RCP FCparams; + if (pL.isSublist("matrixmatrix: kernel params")) + FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params"))); + else + FCparams = rcp(new ParameterList); + // By default, we don't need global constants for TentativeP + FCparams->set("compute global constants", FCparams->get("compute global constants", false)); + std::string levelIDs = toString(levelID); + FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + levelIDs); + RCP dummy_e; + RCP dummy_i; + + PtentCrs->expertStaticFillComplete(coarseMap, A->getDomainMap(), dummy_i, dummy_e, FCparams); } template void TentativePFactory:: - BuildPuncoupled(RCP A, RCP aggregates, RCP amalgInfo, RCP fineNullspace, - RCP coarseMap, RCP& Ptentative, RCP& coarseNullspace, const int levelID) const { - RCP rowMap = A->getRowMap(); - RCP colMap = A->getColMap(); - const size_t numRows = rowMap->getLocalNumElements(); + BuildPuncoupledBlockCrs(RCP A, RCP aggregates, RCP amalgInfo, RCP fineNullspace, + RCP coarsePointMap, RCP& Ptentative, RCP& coarseNullspace, const int levelID) const { + /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could + be generalized later, if we ever need to do so: + 1) Null space dimension === block size of matrix: So no elasticity right now + 2) QR is not supported: Under assumption #1, this shouldn't cause problems. + 3) Maps are "good": Aka the first chunk of the ColMap is the RowMap. + + These assumptions keep our code way simpler and still support the use cases we actually care about. + */ + + RCP rowMap = A->getRowMap(); + RCP rangeMap = A->getRangeMap(); + RCP colMap = A->getColMap(); + // const size_t numFinePointRows = rangeMap->getLocalNumElements(); + const size_t numFineBlockRows = rowMap->getLocalNumElements(); typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType Magnitude; + // typedef typename STS::magnitudeType Magnitude; const SC zero = STS::zero(); const SC one = STS::one(); const LO INVALID = Teuchos::OrdinalTraits::invalid(); @@ -807,6 +591,16 @@ void TentativePFactory:: const size_t NSDim = fineNullspace->getNumVectors(); ArrayRCP aggSizes = aggregates->ComputeAggregateSizesArrayRCP(); + // Need to generate the coarse block map + // NOTE: We assume NSDim == block size here + // NOTE: We also assume that coarseMap has contiguous GIDs + // const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements(); + const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim; + RCP coarseBlockMap = MapFactory::Build(coarsePointMap->lib(), + Teuchos::OrdinalTraits::invalid(), + numCoarseBlockRows, + coarsePointMap->getIndexBase(), + coarsePointMap->getComm()); // Sanity checking const ParameterList& pL = GetParameterList(); const bool& doQRStep = pL.get("tentative: calculate qr"); @@ -815,6 +609,8 @@ void TentativePFactory:: TEUCHOS_TEST_FOR_EXCEPTION(doQRStep && constantColSums, Exceptions::RuntimeError, "MueLu::TentativePFactory::MakeTentative: cannot use 'constant column sums' and 'calculate qr' at the same time"); + // The aggregates use the amalgamated column map, which in this case is what we want + // Aggregates map is based on the amalgamated column map // We can skip global-to-local conversion if LIDs in row map are // same as LIDs in column map @@ -825,291 +621,501 @@ void TentativePFactory:: // aggStart[i]..aggStart[i+1] are indices into aggToRowMapLO // aggToRowMapLO[aggStart[i]]..aggToRowMapLO[aggStart[i+1]-1] are the DOFs in aggregate i ArrayRCP aggStart; - ArrayRCP aggToRowMapLO; - ArrayRCP aggToRowMapGO; - if (goodMap) { - amalgInfo->UnamalgamateAggregatesLO(*aggregates, aggStart, aggToRowMapLO); - GetOStream(Runtime1) << "Column map is consistent with the row map, good." << std::endl; + ArrayRCP aggToRowMapLO; + ArrayRCP aggToRowMapGO; + if (goodMap) { + amalgInfo->UnamalgamateAggregatesLO(*aggregates, aggStart, aggToRowMapLO); + GetOStream(Runtime1) << "Column map is consistent with the row map, good." << std::endl; + } else { + throw std::runtime_error("TentativePFactory::PuncoupledBlockCrs: Inconsistent maps not currently supported"); + } + + coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim); + + // Pull out the nullspace vectors so that we can have random access. + ArrayRCP> fineNS(NSDim); + ArrayRCP> coarseNS(NSDim); + for (size_t i = 0; i < NSDim; i++) { + fineNS[i] = fineNullspace->getData(i); + if (coarsePointMap->getLocalNumElements() > 0) + coarseNS[i] = coarseNullspace->getDataNonConst(i); + } + + // BlockCrs requires that we build the (block) graph first, so let's do that... + // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one + // block non-zero per row in the matrix; + RCP BlockGraph = CrsGraphFactory::Build(rowMap, coarseBlockMap, 0); + ArrayRCP iaPtent; + ArrayRCP jaPtent; + BlockGraph->allocateAllIndices(numFineBlockRows, iaPtent, jaPtent); + ArrayView ia = iaPtent(); + ArrayView ja = jaPtent(); + + for (size_t i = 0; i < numFineBlockRows; i++) { + ia[i] = i; + ja[i] = INVALID; + } + ia[numCoarseBlockRows] = numCoarseBlockRows; + + for (GO agg = 0; agg < numAggs; agg++) { + LO aggSize = aggStart[agg + 1] - aggStart[agg]; + Xpetra::global_size_t offset = agg; + + for (LO j = 0; j < aggSize; j++) { + // FIXME: Allow for bad maps + const LO localRow = aggToRowMapLO[aggStart[agg] + j]; + const size_t rowStart = ia[localRow]; + ja[rowStart] = offset; + } + } + + // Compress storage (remove all INVALID, which happen when we skip zeros) + // We do that in-place + size_t ia_tmp = 0, nnz = 0; + for (size_t i = 0; i < numFineBlockRows; i++) { + for (size_t j = ia_tmp; j < ia[i + 1]; j++) + if (ja[j] != INVALID) { + ja[nnz] = ja[j]; + nnz++; + } + ia_tmp = ia[i + 1]; + ia[i + 1] = nnz; + } + + if (rowMap->lib() == Xpetra::UseTpetra) { + // - Cannot resize for Epetra, as it checks for same pointers + // - Need to resize for Tpetra, as it check ().size() == ia[numRows] + // NOTE: these invalidate ja and val views + jaPtent.resize(nnz); + } + + GetOStream(Runtime1) << "TentativePFactory : generating block graph" << std::endl; + BlockGraph->setAllIndices(iaPtent, jaPtent); + + // Managing labels & constants for ESFC + { + RCP FCparams; + if (pL.isSublist("matrixmatrix: kernel params")) + FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params"))); + else + FCparams = rcp(new ParameterList); + // By default, we don't need global constants for TentativeP, but we do want it for the graph + // if we're printing statistics, so let's leave it on for now. + FCparams->set("compute global constants", FCparams->get("compute global constants", true)); + std::string levelIDs = toString(levelID); + FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + levelIDs); + RCP dummy_e; + RCP dummy_i; + BlockGraph->expertStaticFillComplete(coarseBlockMap, rowMap, dummy_i, dummy_e, FCparams); + } + + // Now let's make a BlockCrs Matrix + // NOTE: Assumes block size== NSDim + RCP> P_xpetra = Xpetra::CrsMatrixFactory::BuildBlock(BlockGraph, coarsePointMap, rangeMap, NSDim); + RCP> P_tpetra = rcp_dynamic_cast>(P_xpetra); + if (P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix"); + RCP P_wrap = rcp(new CrsMatrixWrap(P_xpetra)); + + ///////////////////////////// + // "no-QR" option // + ///////////////////////////// + // Local Q factor is just the fine nullspace support over the current aggregate. + // Local R factor is the identity. + // NOTE: We're not going to do a QR here as we're assuming that blocksize == NSDim + // NOTE: "goodMap" case only + Teuchos::Array block(NSDim * NSDim, zero); + Teuchos::Array bcol(1); + + GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl; + for (LO agg = 0; agg < numAggs; agg++) { + bcol[0] = agg; + const LO aggSize = aggStart[agg + 1] - aggStart[agg]; + Xpetra::global_size_t offset = agg * NSDim; + + // Process each row in the local Q factor + // NOTE: Blocks are in row-major order + for (LO j = 0; j < aggSize; j++) { + const LO localBlockRow = aggToRowMapLO[aggStart[agg] + j]; + + for (size_t r = 0; r < NSDim; r++) { + LO localPointRow = localBlockRow * NSDim + r; + for (size_t c = 0; c < NSDim; c++) + block[r * NSDim + c] = fineNS[c][localPointRow]; + } + // NOTE: Assumes columns==aggs and are ordered sequentially + P_tpetra->replaceLocalValues(localBlockRow, bcol(), block()); + + } // end aggSize + + for (size_t j = 0; j < NSDim; j++) + coarseNS[j][offset + j] = one; + + } // for (GO agg = 0; agg < numAggs; agg++) + + Ptentative = P_wrap; +} + +template +void TentativePFactory:: + BuildPcoupled(RCP A, RCP aggregates, RCP amalgInfo, RCP fineNullspace, + RCP coarseMap, RCP& Ptentative, RCP& coarseNullspace) const { + typedef Teuchos::ScalarTraits STS; + typedef typename STS::magnitudeType Magnitude; + const SC zero = STS::zero(); + const SC one = STS::one(); + + // number of aggregates + GO numAggs = aggregates->GetNumAggregates(); + + // Create a lookup table to determine the rows (fine DOFs) that belong to a given aggregate. + // aggStart is a pointer into aggToRowMap + // aggStart[i]..aggStart[i+1] are indices into aggToRowMap + // aggToRowMap[aggStart[i]]..aggToRowMap[aggStart[i+1]-1] are the DOFs in aggregate i + ArrayRCP aggStart; + ArrayRCP aggToRowMap; + amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMap); - } else { - amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMapGO); - GetOStream(Warnings0) << "Column map is not consistent with the row map\n" - << "using GO->LO conversion with performance penalty" << std::endl; + // find size of the largest aggregate + LO maxAggSize = 0; + for (GO i = 0; i < numAggs; ++i) { + LO sizeOfThisAgg = aggStart[i + 1] - aggStart[i]; + if (sizeOfThisAgg > maxAggSize) maxAggSize = sizeOfThisAgg; } - coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim); - // Pull out the nullspace vectors so that we can have random access. - ArrayRCP > fineNS(NSDim); - ArrayRCP > coarseNS(NSDim); - for (size_t i = 0; i < NSDim; i++) { - fineNS[i] = fineNullspace->getData(i); - if (coarseMap->getLocalNumElements() > 0) - coarseNS[i] = coarseNullspace->getDataNonConst(i); - } + // dimension of fine level nullspace + const size_t NSDim = fineNullspace->getNumVectors(); - size_t nnzEstimate = numRows * NSDim; + // index base for coarse Dof map (usually 0) + GO indexBase = A->getRowMap()->getIndexBase(); - // Time to construct the matrix and fill in the values - Ptentative = rcp(new CrsMatrixWrap(rowMap, coarseMap, 0)); - RCP PtentCrs = rcp_dynamic_cast(Ptentative)->getCrsMatrix(); + const RCP nonUniqueMap = amalgInfo->ComputeUnamalgamatedImportDofMap(*aggregates); + const RCP uniqueMap = A->getDomainMap(); + RCP importer = ImportFactory::Build(uniqueMap, nonUniqueMap); + RCP fineNullspaceWithOverlap = MultiVectorFactory::Build(nonUniqueMap, NSDim); + fineNullspaceWithOverlap->doImport(*fineNullspace, *importer, Xpetra::INSERT); - ArrayRCP iaPtent; - ArrayRCP jaPtent; - ArrayRCP valPtent; + // Pull out the nullspace vectors so that we can have random access. + ArrayRCP> fineNS(NSDim); + for (size_t i = 0; i < NSDim; ++i) + fineNS[i] = fineNullspaceWithOverlap->getData(i); - PtentCrs->allocateAllValues(nnzEstimate, iaPtent, jaPtent, valPtent); + // Allocate storage for the coarse nullspace. + coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim); - ArrayView ia = iaPtent(); - ArrayView ja = jaPtent(); - ArrayView val = valPtent(); + ArrayRCP> coarseNS(NSDim); + for (size_t i = 0; i < NSDim; ++i) + if (coarseMap->getLocalNumElements() > 0) coarseNS[i] = coarseNullspace->getDataNonConst(i); - ia[0] = 0; - for (size_t i = 1; i <= numRows; i++) - ia[i] = ia[i - 1] + NSDim; + // This makes the rowmap of Ptent the same as that of A-> + // This requires moving some parts of some local Q's to other processors + // because aggregates can span processors. + RCP rowMapForPtent = A->getRowMap(); + const Map& rowMapForPtentRef = *rowMapForPtent; - for (size_t j = 0; j < nnzEstimate; j++) { - ja[j] = INVALID; - val[j] = zero; + // Set up storage for the rows of the local Qs that belong to other processors. + // FIXME This is inefficient and could be done within the main loop below with std::vector's. + RCP colMap = A->getColMap(); + + RCP ghostQMap; + RCP ghostQvalues; + Array>> ghostQcolumns; + RCP> ghostQrowNums; + ArrayRCP> ghostQvals; + ArrayRCP> ghostQcols; + ArrayRCP ghostQrows; + + Array ghostGIDs; + for (LO j = 0; j < numAggs; ++j) { + for (LO k = aggStart[j]; k < aggStart[j + 1]; ++k) { + if (rowMapForPtentRef.isNodeGlobalElement(aggToRowMap[k]) == false) { + ghostGIDs.push_back(aggToRowMap[k]); + } + } + } + ghostQMap = MapFactory::Build(A->getRowMap()->lib(), + Teuchos::OrdinalTraits::invalid(), + ghostGIDs, + indexBase, A->getRowMap()->getComm()); // JG:Xpetra::global_size_t>? + // Vector to hold bits of Q that go to other processors. + ghostQvalues = MultiVectorFactory::Build(ghostQMap, NSDim); + // Note that Epetra does not support MultiVectors templated on Scalar != double. + // So to work around this, we allocate an array of Vectors. This shouldn't be too + // expensive, as the number of Vectors is NSDim. + ghostQcolumns.resize(NSDim); + for (size_t i = 0; i < NSDim; ++i) + ghostQcolumns[i] = Xpetra::VectorFactory::Build(ghostQMap); + ghostQrowNums = Xpetra::VectorFactory::Build(ghostQMap); + if (ghostQvalues->getLocalLength() > 0) { + ghostQvals.resize(NSDim); + ghostQcols.resize(NSDim); + for (size_t i = 0; i < NSDim; ++i) { + ghostQvals[i] = ghostQvalues->getDataNonConst(i); + ghostQcols[i] = ghostQcolumns[i]->getDataNonConst(0); + } + ghostQrows = ghostQrowNums->getDataNonConst(0); } - if (doQRStep) { - //////////////////////////////// - // Standard aggregate-wise QR // - //////////////////////////////// - for (GO agg = 0; agg < numAggs; agg++) { - LO aggSize = aggStart[agg + 1] - aggStart[agg]; + // importer to handle moving Q + importer = ImportFactory::Build(ghostQMap, A->getRowMap()); - Xpetra::global_size_t offset = agg * NSDim; + // Dense QR solver + Teuchos::SerialQRDenseSolver qrSolver; - // Extract the piece of the nullspace corresponding to the aggregate, and - // put it in the flat array, "localQR" (in column major format) for the - // QR routine. - Teuchos::SerialDenseMatrix localQR(aggSize, NSDim); - if (goodMap) { - for (size_t j = 0; j < NSDim; j++) - for (LO k = 0; k < aggSize; k++) - localQR(k, j) = fineNS[j][aggToRowMapLO[aggStart[agg] + k]]; - } else { - for (size_t j = 0; j < NSDim; j++) - for (LO k = 0; k < aggSize; k++) - localQR(k, j) = fineNS[j][rowMap->getLocalElement(aggToRowMapGO[aggStart[agg] + k])]; - } + // Allocate temporary storage for the tentative prolongator. + Array globalColPtr(maxAggSize * NSDim, 0); + Array localColPtr(maxAggSize * NSDim, 0); + Array valPtr(maxAggSize * NSDim, 0.); - // Test for zero columns - for (size_t j = 0; j < NSDim; j++) { - bool bIsZeroNSColumn = true; + // Create column map for Ptent, estimate local #nonzeros in Ptent, and create Ptent itself. + const Map& coarseMapRef = *coarseMap; - for (LO k = 0; k < aggSize; k++) - if (localQR(k, j) != zero) - bIsZeroNSColumn = false; + // For the 3-arrays constructor + ArrayRCP ptent_rowptr; + ArrayRCP ptent_colind; + ArrayRCP ptent_values; - TEUCHOS_TEST_FOR_EXCEPTION(bIsZeroNSColumn == true, Exceptions::RuntimeError, - "MueLu::TentativePFactory::MakeTentative: fine level NS part has a zero column in NS column " << j); - } + // Because ArrayRCPs are slow... + ArrayView rowptr_v; + ArrayView colind_v; + ArrayView values_v; - // Calculate QR decomposition (standard) - // NOTE: Q is stored in localQR and R is stored in coarseNS - if (aggSize >= Teuchos::as(NSDim)) { - if (NSDim == 1) { - // Only one nullspace vector, calculate Q and R by hand - Magnitude norm = STS::magnitude(zero); - for (size_t k = 0; k < Teuchos::as(aggSize); k++) - norm += STS::magnitude(localQR(k, 0) * localQR(k, 0)); - norm = Teuchos::ScalarTraits::squareroot(norm); + // For temporary usage + Array rowptr_temp; + Array colind_temp; + Array values_temp; - // R = norm - coarseNS[0][offset] = norm; + RCP PtentCrs; - // Q = localQR(:,0)/norm - for (LO i = 0; i < aggSize; i++) - localQR(i, 0) /= norm; + RCP PtentCrsWrap = rcp(new CrsMatrixWrap(rowMapForPtent, NSDim)); + PtentCrs = PtentCrsWrap->getCrsMatrix(); + Ptentative = PtentCrsWrap; - } else { - Teuchos::SerialQRDenseSolver qrSolver; - qrSolver.setMatrix(Teuchos::rcp(&localQR, false)); - qrSolver.factor(); + //***************************************************************** + // Loop over all aggregates and calculate local QR decompositions. + //***************************************************************** + GO qctr = 0; // for indexing into Ptent data vectors + const Map& nonUniqueMapRef = *nonUniqueMap; - // R = upper triangular part of localQR - for (size_t j = 0; j < NSDim; j++) - for (size_t k = 0; k <= j; k++) - coarseNS[j][offset + k] = localQR(k, j); // TODO is offset+k the correct local ID?! + size_t total_nnz_count = 0; - // Calculate Q, the tentative prolongator. - // The Lapack GEQRF call only works for myAggsize >= NSDim - qrSolver.formQ(); - Teuchos::RCP > qFactor = qrSolver.getQ(); - for (size_t j = 0; j < NSDim; j++) - for (size_t i = 0; i < Teuchos::as(aggSize); i++) - localQR(i, j) = (*qFactor)(i, j); + for (GO agg = 0; agg < numAggs; ++agg) { + LO myAggSize = aggStart[agg + 1] - aggStart[agg]; + // For each aggregate, extract the corresponding piece of the nullspace and put it in the flat array, + // "localQR" (in column major format) for the QR routine. + Teuchos::SerialDenseMatrix localQR(myAggSize, NSDim); + for (size_t j = 0; j < NSDim; ++j) { + bool bIsZeroNSColumn = true; + for (LO k = 0; k < myAggSize; ++k) { + // aggToRowMap[aggPtr[i]+k] is the kth DOF in the ith aggregate + // fineNS[j][n] is the nth entry in the jth NS vector + try { + SC nsVal = fineNS[j][nonUniqueMapRef.getLocalElement(aggToRowMap[aggStart[agg] + k])]; // extract information from fine level NS + localQR(k, j) = nsVal; + if (nsVal != zero) bIsZeroNSColumn = false; + } catch (...) { + GetOStream(Runtime1, -1) << "length of fine level nsp: " << fineNullspace->getGlobalLength() << std::endl; + GetOStream(Runtime1, -1) << "length of fine level nsp w overlap: " << fineNullspaceWithOverlap->getGlobalLength() << std::endl; + GetOStream(Runtime1, -1) << "(local?) aggId=" << agg << std::endl; + GetOStream(Runtime1, -1) << "aggSize=" << myAggSize << std::endl; + GetOStream(Runtime1, -1) << "agg DOF=" << k << std::endl; + GetOStream(Runtime1, -1) << "NS vector j=" << j << std::endl; + GetOStream(Runtime1, -1) << "j*myAggSize + k = " << j * myAggSize + k << std::endl; + GetOStream(Runtime1, -1) << "aggToRowMap[" << agg << "][" << k << "] = " << aggToRowMap[aggStart[agg] + k] << std::endl; + GetOStream(Runtime1, -1) << "id aggToRowMap[agg][k]=" << aggToRowMap[aggStart[agg] + k] << " is global element in nonUniqueMap = " << nonUniqueMapRef.isNodeGlobalElement(aggToRowMap[aggStart[agg] + k]) << std::endl; + GetOStream(Runtime1, -1) << "colMap local id aggToRowMap[agg][k]=" << nonUniqueMapRef.getLocalElement(aggToRowMap[aggStart[agg] + k]) << std::endl; + GetOStream(Runtime1, -1) << "fineNS...=" << fineNS[j][nonUniqueMapRef.getLocalElement(aggToRowMap[aggStart[agg] + k])] << std::endl; + GetOStream(Errors, -1) << "caught an error!" << std::endl; } + } // for (LO k=0 ... + TEUCHOS_TEST_FOR_EXCEPTION(bIsZeroNSColumn == true, Exceptions::RuntimeError, "MueLu::TentativePFactory::MakeTentative: fine level NS part has a zero column. Error."); + } // for (LO j=0 ... - } else { - // Special handling for aggSize < NSDim (i.e. single node aggregates in structural mechanics) - - // The local QR decomposition is not possible in the "overconstrained" - // case (i.e. number of columns in localQR > number of rows), which - // corresponds to #DOFs in Aggregate < NSDim. For usual problems this - // is only possible for single node aggregates in structural mechanics. - // (Similar problems may arise in discontinuous Galerkin problems...) - // We bypass the QR decomposition and use an identity block in the - // tentative prolongator for the single node aggregate and transfer the - // corresponding fine level null space information 1-to-1 to the coarse - // level null space part. + Xpetra::global_size_t offset = agg * NSDim; - // NOTE: The resulting tentative prolongation operator has - // (aggSize*DofsPerNode-NSDim) zero columns leading to a singular - // coarse level operator A. To deal with that one has the following - // options: - // - Use the "RepairMainDiagonal" flag in the RAPFactory (default: - // false) to add some identity block to the diagonal of the zero rows - // in the coarse level operator A, such that standard level smoothers - // can be used again. - // - Use special (projection-based) level smoothers, which can deal - // with singular matrices (very application specific) - // - Adapt the code below to avoid zero columns. However, we do not - // support a variable number of DOFs per node in MueLu/Xpetra which - // makes the implementation really hard. + if (myAggSize >= Teuchos::as(NSDim)) { + // calculate QR decomposition (standard) + // R is stored in localQR (size: myAggSize x NSDim) - // R = extended (by adding identity rows) localQR - for (size_t j = 0; j < NSDim; j++) - for (size_t k = 0; k < NSDim; k++) - if (k < as(aggSize)) - coarseNS[j][offset + k] = localQR(k, j); - else - coarseNS[j][offset + k] = (k == j ? one : zero); + // Householder multiplier + SC tau = localQR(0, 0); - // Q = I (rectangular) - for (size_t i = 0; i < as(aggSize); i++) - for (size_t j = 0; j < NSDim; j++) - localQR(i, j) = (j == i ? one : zero); + if (NSDim == 1) { + // Only one nullspace vector, so normalize by hand + Magnitude dtemp = 0; + for (size_t k = 0; k < Teuchos::as(myAggSize); ++k) { + Magnitude tmag = STS::magnitude(localQR(k, 0)); + dtemp += tmag * tmag; + } + dtemp = Teuchos::ScalarTraits::squareroot(dtemp); + tau = localQR(0, 0); + localQR(0, 0) = dtemp; + } else { + qrSolver.setMatrix(Teuchos::rcp(&localQR, false)); + qrSolver.factor(); } - // Process each row in the local Q factor - // FIXME: What happens if maps are blocked? - for (LO j = 0; j < aggSize; j++) { - LO localRow = (goodMap ? aggToRowMapLO[aggStart[agg] + j] : rowMap->getLocalElement(aggToRowMapGO[aggStart[agg] + j])); - - size_t rowStart = ia[localRow]; - for (size_t k = 0, lnnz = 0; k < NSDim; k++) { - // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions) - if (localQR(j, k) != zero) { - ja[rowStart + lnnz] = offset + k; - val[rowStart + lnnz] = localQR(j, k); - lnnz++; + // Extract R, the coarse nullspace. This is stored in upper triangular part of localQR. + // Note: coarseNS[i][.] is the ith coarse nullspace vector, which may be counter to your intuition. + // This stores the (offset+k)th entry only if it is local according to the coarseMap. + for (size_t j = 0; j < NSDim; ++j) { + for (size_t k = 0; k <= j; ++k) { + try { + if (coarseMapRef.isNodeLocalElement(offset + k)) { + coarseNS[j][offset + k] = localQR(k, j); // TODO is offset+k the correct local ID?! + } + } catch (...) { + GetOStream(Errors, -1) << "caught error in coarseNS insert, j=" << j << ", offset+k = " << offset + k << std::endl; } } } - } - } else { - GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl; - if (NSDim > 1) - GetOStream(Warnings0) << "TentativePFactory : for nontrivial nullspace, this may degrade performance" << std::endl; - ///////////////////////////// - // "no-QR" option // - ///////////////////////////// - // Local Q factor is just the fine nullspace support over the current aggregate. - // Local R factor is the identity. - // TODO I have not implemented any special handling for aggregates that are too - // TODO small to locally support the nullspace, as is done in the standard QR - // TODO case above. - if (goodMap) { - for (GO agg = 0; agg < numAggs; agg++) { - const LO aggSize = aggStart[agg + 1] - aggStart[agg]; - Xpetra::global_size_t offset = agg * NSDim; + // Calculate Q, the tentative prolongator. + // The Lapack GEQRF call only works for myAggsize >= NSDim - // Process each row in the local Q factor - // FIXME: What happens if maps are blocked? - for (LO j = 0; j < aggSize; j++) { - // TODO Here I do not check for a zero nullspace column on the aggregate. - // as is done in the standard QR case. + if (NSDim == 1) { + // Only one nullspace vector, so calculate Q by hand + Magnitude dtemp = Teuchos::ScalarTraits::magnitude(localQR(0, 0)); + localQR(0, 0) = tau; + dtemp = 1 / dtemp; + for (LocalOrdinal i = 0; i < myAggSize; ++i) { + localQR(i, 0) *= dtemp; + } + } else { + qrSolver.formQ(); + Teuchos::RCP> qFactor = qrSolver.getQ(); + for (size_t j = 0; j < NSDim; j++) { + for (size_t i = 0; i < Teuchos::as(myAggSize); i++) { + localQR(i, j) = (*qFactor)(i, j); + } + } + } - const LO localRow = aggToRowMapLO[aggStart[agg] + j]; + // end default case (myAggSize >= NSDim) + } else { // special handling for myAggSize < NSDim (i.e. 1pt nodes) + // See comments for the uncoupled case - const size_t rowStart = ia[localRow]; + // R = extended (by adding identity rows) localQR + for (size_t j = 0; j < NSDim; j++) + for (size_t k = 0; k < NSDim; k++) { + TEUCHOS_TEST_FOR_EXCEPTION(!coarseMapRef.isNodeLocalElement(offset + k), Exceptions::RuntimeError, + "Caught error in coarseNS insert, j=" << j << ", offset+k = " << offset + k); - for (size_t k = 0, lnnz = 0; k < NSDim; k++) { - // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions) - SC qr_jk = fineNS[k][aggToRowMapLO[aggStart[agg] + j]]; - if (constantColSums) qr_jk = qr_jk / (Magnitude)aggSizes[agg]; - if (qr_jk != zero) { - ja[rowStart + lnnz] = offset + k; - val[rowStart + lnnz] = qr_jk; - lnnz++; - } - } + if (k < as(myAggSize)) + coarseNS[j][offset + k] = localQR(k, j); + else + coarseNS[j][offset + k] = (k == j ? one : zero); } - for (size_t j = 0; j < NSDim; j++) - coarseNS[j][offset + j] = one; - } // for (GO agg = 0; agg < numAggs; agg++) - } else { - for (GO agg = 0; agg < numAggs; agg++) { - const LO aggSize = aggStart[agg + 1] - aggStart[agg]; - Xpetra::global_size_t offset = agg * NSDim; - for (LO j = 0; j < aggSize; j++) { - const LO localRow = rowMap->getLocalElement(aggToRowMapGO[aggStart[agg] + j]); + // Q = I (rectangular) + for (size_t i = 0; i < as(myAggSize); i++) + for (size_t j = 0; j < NSDim; j++) + localQR(i, j) = (j == i ? one : zero); + } // end else (special handling for 1pt aggregates) - const size_t rowStart = ia[localRow]; + // Process each row in the local Q factor. If the row is local to the current processor + // according to the rowmap, insert it into Ptentative. Otherwise, save it in ghostQ + // to be communicated later to the owning processor. + // FIXME -- what happens if maps are blocked? + for (GO j = 0; j < myAggSize; ++j) { + // This loop checks whether row associated with current DOF is local, according to rowMapForPtent. + // If it is, the row is inserted. If not, the row number, columns, and values are saved in + // MultiVectors that will be sent to other processors. + GO globalRow = aggToRowMap[aggStart[agg] + j]; - for (size_t k = 0, lnnz = 0; k < NSDim; ++k) { - // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions) - SC qr_jk = fineNS[k][rowMap->getLocalElement(aggToRowMapGO[aggStart[agg] + j])]; - if (constantColSums) qr_jk = qr_jk / (Magnitude)aggSizes[agg]; - if (qr_jk != zero) { - ja[rowStart + lnnz] = offset + k; - val[rowStart + lnnz] = qr_jk; - lnnz++; + // TODO is the use of Xpetra::global_size_t below correct? + if (rowMapForPtentRef.isNodeGlobalElement(globalRow) == false) { + ghostQrows[qctr] = globalRow; + for (size_t k = 0; k < NSDim; ++k) { + ghostQcols[k][qctr] = coarseMapRef.getGlobalElement(agg * NSDim + k); + ghostQvals[k][qctr] = localQR(j, k); + } + ++qctr; + } else { + size_t nnz = 0; + for (size_t k = 0; k < NSDim; ++k) { + try { + if (localQR(j, k) != Teuchos::ScalarTraits::zero()) { + localColPtr[nnz] = agg * NSDim + k; + globalColPtr[nnz] = coarseMapRef.getGlobalElement(localColPtr[nnz]); + valPtr[nnz] = localQR(j, k); + ++total_nnz_count; + ++nnz; } + } catch (...) { + GetOStream(Errors, -1) << "caught error in colPtr/valPtr insert, current index=" << nnz << std::endl; } + } // for (size_t k=0; kinsertGlobalValues(globalRow, globalColPtr.view(0, nnz), valPtr.view(0, nnz)); + } catch (...) { + GetOStream(Errors, -1) << "pid " << A->getRowMap()->getComm()->getRank() + << "caught error during Ptent row insertion, global row " + << globalRow << std::endl; } - for (size_t j = 0; j < NSDim; j++) - coarseNS[j][offset + j] = one; - } // for (GO agg = 0; agg < numAggs; agg++) + } + } // for (GO j=0; j> targetQrowNums = Xpetra::VectorFactory::Build(rowMapForPtent); + targetQrowNums->putScalar(-1); + targetQrowNums->doImport(*ghostQrowNums, *importer, Xpetra::INSERT); + ArrayRCP targetQrows = targetQrowNums->getDataNonConst(0); - // Compress storage (remove all INVALID, which happen when we skip zeros) - // We do that in-place - size_t ia_tmp = 0, nnz = 0; - for (size_t i = 0; i < numRows; i++) { - for (size_t j = ia_tmp; j < ia[i + 1]; j++) - if (ja[j] != INVALID) { - ja[nnz] = ja[j]; - val[nnz] = val[j]; - nnz++; - } - ia_tmp = ia[i + 1]; - ia[i + 1] = nnz; - } - if (rowMap->lib() == Xpetra::UseTpetra) { - // - Cannot resize for Epetra, as it checks for same pointers - // - Need to resize for Tpetra, as it check ().size() == ia[numRows] - // NOTE: these invalidate ja and val views - jaPtent.resize(nnz); - valPtent.resize(nnz); + // Now create map based on just the row numbers imported. + Array gidsToImport; + gidsToImport.reserve(targetQrows.size()); + for (typename ArrayRCP::iterator r = targetQrows.begin(); r != targetQrows.end(); ++r) { + if (*r > -1) { + gidsToImport.push_back(*r); + } } + RCP reducedMap = MapFactory::Build(A->getRowMap()->lib(), + Teuchos::OrdinalTraits::invalid(), + gidsToImport, indexBase, A->getRowMap()->getComm()); - GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl; + // Import using the row numbers that this processor will receive. + importer = ImportFactory::Build(ghostQMap, reducedMap); - PtentCrs->setAllValues(iaPtent, jaPtent, valPtent); + Array>> targetQcolumns(NSDim); + for (size_t i = 0; i < NSDim; ++i) { + targetQcolumns[i] = Xpetra::VectorFactory::Build(reducedMap); + targetQcolumns[i]->doImport(*(ghostQcolumns[i]), *importer, Xpetra::INSERT); + } + RCP targetQvalues = MultiVectorFactory::Build(reducedMap, NSDim); + targetQvalues->doImport(*ghostQvalues, *importer, Xpetra::INSERT); - // Managing labels & constants for ESFC - RCP FCparams; - if (pL.isSublist("matrixmatrix: kernel params")) - FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params"))); - else - FCparams = rcp(new ParameterList); - // By default, we don't need global constants for TentativeP - FCparams->set("compute global constants", FCparams->get("compute global constants", false)); - std::string levelIDs = toString(levelID); - FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + levelIDs); - RCP dummy_e; - RCP dummy_i; + ArrayRCP> targetQvals; + ArrayRCP> targetQcols; + if (targetQvalues->getLocalLength() > 0) { + targetQvals.resize(NSDim); + targetQcols.resize(NSDim); + for (size_t i = 0; i < NSDim; ++i) { + targetQvals[i] = targetQvalues->getDataNonConst(i); + targetQcols[i] = targetQcolumns[i]->getDataNonConst(0); + } + } - PtentCrs->expertStaticFillComplete(coarseMap, A->getDomainMap(), dummy_i, dummy_e, FCparams); + valPtr = Array(NSDim, 0.); + globalColPtr = Array(NSDim, 0); + for (typename Array::iterator r = gidsToImport.begin(); r != gidsToImport.end(); ++r) { + if (targetQvalues->getLocalLength() > 0) { + for (size_t j = 0; j < NSDim; ++j) { + valPtr[j] = targetQvals[j][reducedMap->getLocalElement(*r)]; + globalColPtr[j] = targetQcols[j][reducedMap->getLocalElement(*r)]; + } + Ptentative->insertGlobalValues(*r, globalColPtr.view(0, NSDim), valPtr.view(0, NSDim)); + } // if (targetQvalues->getLocalLength() > 0) + } + + Ptentative->fillComplete(coarseMap, A->getDomainMap()); } } // namespace MueLu diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_decl.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_decl.hpp index 50990299c898..04e1a6831173 100644 --- a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_decl.hpp +++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_decl.hpp @@ -10,21 +10,19 @@ #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DECL_HPP #define MUELU_TENTATIVEPFACTORY_KOKKOS_DECL_HPP -#include "MueLu_ConfigDefs.hpp" - -#include "MueLu_TentativePFactory_kokkos_fwd.hpp" - +#include #include -#include "Teuchos_ScalarTraits.hpp" +#include -#include "Xpetra_CrsGraphFactory_fwd.hpp" +#include "MueLu_ConfigDefs.hpp" #include "MueLu_Aggregates_fwd.hpp" #include "MueLu_AmalgamationInfo_fwd.hpp" #include "MueLu_Level_fwd.hpp" #include "MueLu_PerfUtils_fwd.hpp" #include "MueLu_PFactory.hpp" +#include "MueLu_Utilities_fwd.hpp" namespace MueLu { @@ -66,17 +64,15 @@ namespace MueLu { | P | TentativePFactory | Non-smoothed "tentative" prolongation operator (with piece-wise constant transfer operator basis functions) | Nullspace | TentativePFactory | Coarse near null space vectors. Please also check the documentation of the NullspaceFactory for the special dependency tree of the "Nullspace" variable throughout all multigrid levels. */ -template +template class TentativePFactory_kokkos : public PFactory { public: - typedef LocalOrdinal local_ordinal_type; - typedef GlobalOrdinal global_ordinal_type; - typedef typename Node::execution_space execution_space; - typedef Kokkos::RangePolicy range_type; - typedef typename Node::device_type DeviceType; - typedef Node node_type; - typedef typename Teuchos::ScalarTraits::coordinateType real_type; - typedef Xpetra::MultiVector RealValuedMultiVector; + using execution_space = typename Node::execution_space; + using range_type = Kokkos::RangePolicy; + using DeviceType = typename Node::device_type; private: #undef MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT @@ -87,46 +83,44 @@ class TentativePFactory_kokkos : public PFactory { //@{ //! Constructor - TentativePFactory_kokkos() {} + TentativePFactory_kokkos(); //! Destructor. - virtual ~TentativePFactory_kokkos() {} + ~TentativePFactory_kokkos() override; //@} - RCP GetValidParameterList() const; + RCP GetValidParameterList() const override; //! Input //@{ - void DeclareInput(Level& fineLevel, Level& coarseLevel) const; + void DeclareInput(Level& fineLevel, Level& coarseLevel) const override; //@} //! @name Build methods. //@{ - void Build(Level& fineLevel, Level& coarseLevel) const; - void BuildP(Level& fineLevel, Level& coarseLevel) const; + void Build(Level& fineLevel, Level& coarseLevel) const override; + void BuildP(Level& fineLevel, Level& coarseLevel) const override; //@} - // NOTE: All of thess should really be private, but CUDA doesn't like that + // NOTE: All of these should really be private, but CUDA doesn't like that - void BuildPuncoupledBlockCrs(Level& coarseLevel, RCP A, RCP aggregates, RCP amalgInfo, - RCP fineNullspace, RCP coarseMap, RCP& Ptentative, RCP& coarseNullspace, const int levelID) const; + void BuildPuncoupled(Level& coarseLevel, RCP A, RCP aggregates, + RCP amalgInfo, RCP fineNullspace, + RCP coarseMap, RCP& Ptentative, + RCP& coarseNullspace, int levelID) const; - bool isGoodMap(const Map& rowMap, const Map& colMap) const; + void BuildPuncoupledBlockCrs(Level& coarseLevel, RCP A, RCP aggregates, RCP amalgInfo, + RCP fineNullspace, RCP coarsePointMap, RCP& Ptentative, RCP& coarseNullspace, int levelID) const; void BuildPcoupled(RCP A, RCP aggregates, RCP amalgInfo, RCP fineNullspace, RCP coarseMap, RCP& Ptentative, RCP& coarseNullspace) const; - void BuildPuncoupled(Level& coarseLevel, RCP A, RCP aggregates, - RCP amalgInfo, RCP fineNullspace, - RCP coarseMap, RCP& Ptentative, - RCP& coarseNullspace, const int levelID) const; - mutable bool bTransferCoordinates_ = false; }; diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp index ab9c4513fb10..32c6c9487004 100644 --- a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp +++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp @@ -10,334 +10,25 @@ #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP -#include "Kokkos_UnorderedMap.hpp" -#include "Xpetra_CrsGraphFactory.hpp" +#include #include "MueLu_TentativePFactory_kokkos_decl.hpp" #include "MueLu_Aggregates.hpp" #include "MueLu_AmalgamationInfo.hpp" - #include "MueLu_MasterList.hpp" -#include "MueLu_PerfUtils.hpp" #include "MueLu_Monitor.hpp" - -#include "Xpetra_IO.hpp" +#include "MueLu_PerfUtils.hpp" +#include "MueLu_Utilities.hpp" +#include "MueLu_LocalQR.hpp" namespace MueLu { -namespace { // anonymous - -template -class ReduceMaxFunctor { - public: - ReduceMaxFunctor(View view) - : view_(view) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const LocalOrdinal& i, LocalOrdinal& vmax) const { - if (vmax < view_(i)) - vmax = view_(i); - } - - KOKKOS_INLINE_FUNCTION - void join(LocalOrdinal& dst, const LocalOrdinal& src) const { - if (dst < src) { - dst = src; - } - } - - KOKKOS_INLINE_FUNCTION - void init(LocalOrdinal& dst) const { - dst = 0; - } - - private: - View view_; -}; - -// local QR decomposition -template -class LocalQRDecompFunctor { - private: - typedef LOType LO; - typedef GOType GO; - typedef SCType SC; - - typedef typename DeviceType::execution_space execution_space; - typedef typename Kokkos::ArithTraits::val_type impl_SC; - typedef Kokkos::ArithTraits impl_ATS; - typedef typename impl_ATS::magnitudeType Magnitude; - - typedef Kokkos::View shared_matrix; - typedef Kokkos::View shared_vector; - - private: - NspType fineNS; - NspType coarseNS; - aggRowsType aggRows; - maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode) - agg2RowMapLOType agg2RowMapLO; - statusType statusAtomic; - rowsType rows; - rowsAuxType rowsAux; - colsAuxType colsAux; - valsAuxType valsAux; - bool doQRStep; - - public: - LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_) - : fineNS(fineNS_) - , coarseNS(coarseNS_) - , aggRows(aggRows_) - , maxAggDofSize(maxAggDofSize_) - , agg2RowMapLO(agg2RowMapLO_) - , statusAtomic(statusAtomic_) - , rows(rows_) - , rowsAux(rowsAux_) - , colsAux(colsAux_) - , valsAux(valsAux_) - , doQRStep(doQRStep_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const typename Kokkos::TeamPolicy::member_type& thread, size_t& nnz) const { - auto agg = thread.league_rank(); - - // size of aggregate: number of DOFs in aggregate - auto aggSize = aggRows(agg + 1) - aggRows(agg); - - const impl_SC one = impl_ATS::one(); - const impl_SC two = one + one; - const impl_SC zero = impl_ATS::zero(); - const auto zeroM = impl_ATS::magnitude(zero); - - int m = aggSize; - int n = fineNS.extent(1); - - // calculate row offset for coarse nullspace - Xpetra::global_size_t offset = agg * n; - - if (doQRStep) { - // Extract the piece of the nullspace corresponding to the aggregate - shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end) - for (int j = 0; j < n; j++) - for (int k = 0; k < m; k++) - r(k, j) = fineNS(agg2RowMapLO(aggRows(agg) + k), j); -#if 0 - printf("A\n"); - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) - printf(" %5.3lf ", r(i,j)); - printf("\n"); - } -#endif - - // Calculate QR decomposition (standard) - shared_matrix q(thread.team_shmem(), m, m); // Q - if (m >= n) { - bool isSingular = false; - - // Initialize Q^T - auto qt = q; - for (int i = 0; i < m; i++) { - for (int j = 0; j < m; j++) - qt(i, j) = zero; - qt(i, i) = one; - } - - for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize - // FIXME_KOKKOS: use team - Magnitude s = zeroM, norm, norm_x; - for (int i = k + 1; i < m; i++) - s += pow(impl_ATS::magnitude(r(i, k)), 2); - norm = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s); - - if (norm == zero) { - isSingular = true; - break; - } - - r(k, k) -= norm * one; - - norm_x = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s); - if (norm_x == zeroM) { - // We have a single diagonal element in the column. - // No reflections required. Just need to restor r(k,k). - r(k, k) = norm * one; - continue; - } - - // FIXME_KOKKOS: use team - for (int i = k; i < m; i++) - r(i, k) /= norm_x; - - // Update R(k:m,k+1:n) - for (int j = k + 1; j < n; j++) { - // FIXME_KOKKOS: use team in the loops - impl_SC si = zero; - for (int i = k; i < m; i++) - si += r(i, k) * r(i, j); - for (int i = k; i < m; i++) - r(i, j) -= two * si * r(i, k); - } - - // Update Q^T (k:m,k:m) - for (int j = k; j < m; j++) { - // FIXME_KOKKOS: use team in the loops - impl_SC si = zero; - for (int i = k; i < m; i++) - si += r(i, k) * qt(i, j); - for (int i = k; i < m; i++) - qt(i, j) -= two * si * r(i, k); - } - - // Fix R(k:m,k) - r(k, k) = norm * one; - for (int i = k + 1; i < m; i++) - r(i, k) = zero; - } - -#if 0 - // Q = (Q^T)^T - for (int i = 0; i < m; i++) - for (int j = 0; j < i; j++) { - impl_SC tmp = qt(i,j); - qt(i,j) = qt(j,i); - qt(j,i) = tmp; - } -#endif - - // Build coarse nullspace using the upper triangular part of R - for (int j = 0; j < n; j++) - for (int k = 0; k <= j; k++) - coarseNS(offset + k, j) = r(k, j); - - if (isSingular) { - statusAtomic(1) = true; - return; - } - - } else { - // Special handling for m < n (i.e. single node aggregates in structural mechanics) - - // The local QR decomposition is not possible in the "overconstrained" - // case (i.e. number of columns in qr > number of rowsAux), which - // corresponds to #DOFs in Aggregate < n. For usual problems this - // is only possible for single node aggregates in structural mechanics. - // (Similar problems may arise in discontinuous Galerkin problems...) - // We bypass the QR decomposition and use an identity block in the - // tentative prolongator for the single node aggregate and transfer the - // corresponding fine level null space information 1-to-1 to the coarse - // level null space part. - - // NOTE: The resulting tentative prolongation operator has - // (m*DofsPerNode-n) zero columns leading to a singular - // coarse level operator A. To deal with that one has the following - // options: - // - Use the "RepairMainDiagonal" flag in the RAPFactory (default: - // false) to add some identity block to the diagonal of the zero rowsAux - // in the coarse level operator A, such that standard level smoothers - // can be used again. - // - Use special (projection-based) level smoothers, which can deal - // with singular matrices (very application specific) - // - Adapt the code below to avoid zero columns. However, we do not - // support a variable number of DOFs per node in MueLu/Xpetra which - // makes the implementation really hard. - // - // FIXME: do we need to check for singularity here somehow? Zero - // columns would be easy but linear dependency would require proper QR. - - // R = extended (by adding identity rowsAux) qr - for (int j = 0; j < n; j++) - for (int k = 0; k < n; k++) - if (k < m) - coarseNS(offset + k, j) = r(k, j); - else - coarseNS(offset + k, j) = (k == j ? one : zero); - - // Q = I (rectangular) - for (int i = 0; i < m; i++) - for (int j = 0; j < n; j++) - q(i, j) = (j == i ? one : zero); - } - - // Process each row in the local Q factor and fill helper arrays to assemble P - for (int j = 0; j < m; j++) { - LO localRow = agg2RowMapLO(aggRows(agg) + j); - size_t rowStart = rowsAux(localRow); - size_t lnnz = 0; - for (int k = 0; k < n; k++) { - // skip zeros - if (q(j, k) != zero) { - colsAux(rowStart + lnnz) = offset + k; - valsAux(rowStart + lnnz) = q(j, k); - lnnz++; - } - } - rows(localRow + 1) = lnnz; - nnz += lnnz; - } - -#if 0 - printf("R\n"); - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) - printf(" %5.3lf ", coarseNS(i,j)); - printf("\n"); - } - - printf("Q\n"); - for (int i = 0; i < aggSize; i++) { - for (int j = 0; j < aggSize; j++) - printf(" %5.3lf ", q(i,j)); - printf("\n"); - } -#endif - } else { - ///////////////////////////// - // "no-QR" option // - ///////////////////////////// - // Local Q factor is just the fine nullspace support over the current aggregate. - // Local R factor is the identity. - // TODO I have not implemented any special handling for aggregates that are too - // TODO small to locally support the nullspace, as is done in the standard QR - // TODO case above. - - for (int j = 0; j < m; j++) { - LO localRow = agg2RowMapLO(aggRows(agg) + j); - size_t rowStart = rowsAux(localRow); - size_t lnnz = 0; - for (int k = 0; k < n; k++) { - const impl_SC qr_jk = fineNS(localRow, k); - // skip zeros - if (qr_jk != zero) { - colsAux(rowStart + lnnz) = offset + k; - valsAux(rowStart + lnnz) = qr_jk; - lnnz++; - } - } - rows(localRow + 1) = lnnz; - nnz += lnnz; - } - - for (int j = 0; j < n; j++) - coarseNS(offset + j, j) = one; - } - } - - // amount of shared memory - size_t team_shmem_size(int /* team_size */) const { - if (doQRStep) { - int m = maxAggDofSize; - int n = fineNS.extent(1); - return shared_matrix::shmem_size(m, n) + // r - shared_matrix::shmem_size(m, m); // q - } else - return 0; - } -}; +template +TentativePFactory_kokkos::TentativePFactory_kokkos() = default; -} // namespace +template +TentativePFactory_kokkos::~TentativePFactory_kokkos() = default; template RCP TentativePFactory_kokkos::GetValidParameterList() const { @@ -346,7 +37,9 @@ RCP TentativePFactory_kokkossetEntry(name, MasterList::getEntry(name)) SET_VALID_ENTRY("tentative: calculate qr"); SET_VALID_ENTRY("tentative: build coarse coordinates"); + // SET_VALID_ENTRY("tentative: constant column sums"); #undef SET_VALID_ENTRY + validParamList->set("Nullspace name", "Nullspace", "Name for the input nullspace"); validParamList->set>("A", Teuchos::null, "Generating factory of the matrix A"); validParamList->set>("Aggregates", Teuchos::null, "Generating factory of the aggregates"); @@ -397,13 +90,24 @@ void TentativePFactory_kokkos::BuildP FactoryMonitor m(*this, "Build", coarseLevel); typedef typename Teuchos::ScalarTraits::coordinateType coordinate_type; + typedef Xpetra::MultiVector RealValuedMultiVector; typedef Xpetra::MultiVectorFactory RealValuedMultiVectorFactory; + const ParameterList& pL = GetParameterList(); std::string nspName = "Nullspace"; if (pL.isParameter("Nullspace name")) nspName = pL.get("Nullspace name"); - auto A = Get>(fineLevel, "A"); - auto aggregates = Get>(fineLevel, "Aggregates"); + RCP Ptentative; + auto A = Get>(fineLevel, "A"); + auto aggregates = Get>(fineLevel, "Aggregates"); + // No coarse DoFs so we need to bail by setting Ptentattive to null and returning + // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize() + if (aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) { + Ptentative = Teuchos::null; + Set(coarseLevel, "P", Ptentative); + return; + } + auto amalgInfo = Get>(fineLevel, "UnAmalgamationInfo"); auto fineNullspace = Get>(fineLevel, nspName); auto coarseMap = Get>(fineLevel, "CoarseMap"); @@ -412,14 +116,16 @@ void TentativePFactory_kokkos::BuildP fineCoords = Get>(fineLevel, "Coordinates"); } - RCP Ptentative; - // No coarse DoFs so we need to bail by setting Ptentattive to null and returning - // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize() - if (aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) { - Ptentative = Teuchos::null; - Set(coarseLevel, "P", Ptentative); - return; + // FIXME: We should remove the NodeComm on levels past the threshold + if (fineLevel.IsAvailable("Node Comm")) { + RCP> nodeComm = Get>>(fineLevel, "Node Comm"); + Set>>(coarseLevel, "Node Comm", nodeComm); } + + // NOTE: We check DomainMap here rather than RowMap because those are different for BlockCrs matrices + TEUCHOS_TEST_FOR_EXCEPTION(A->getDomainMap()->getLocalNumElements() != fineNullspace->getMap()->getLocalNumElements(), + Exceptions::RuntimeError, "MueLu::TentativePFactory::MakeTentative: Size mismatch between A and Nullspace"); + RCP coarseNullspace; RCP coarseCoords; @@ -486,7 +192,7 @@ void TentativePFactory_kokkos::BuildP typename AppendTrait::type fineCoordsRandomView = fineCoordsView; for (size_t j = 0; j < dim; j++) { Kokkos::parallel_for( - "MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy(0, numAggs), + "MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy(0, numAggs), KOKKOS_LAMBDA(const LO i) { // A row in this graph represents all node ids in the aggregate // Therefore, averaging is very easy @@ -529,13 +235,6 @@ void TentativePFactory_kokkos::BuildP if (bTransferCoordinates_) { Set(coarseLevel, "Coordinates", coarseCoords); } - - // FIXME: We should remove the NodeComm on levels past the threshold - if (fineLevel.IsAvailable("Node Comm")) { - RCP> nodeComm = Get>>(fineLevel, "Node Comm"); - Set>>(coarseLevel, "Node Comm", nodeComm); - } - Set(coarseLevel, "Nullspace", coarseNullspace); Set(coarseLevel, "P", Ptentative); @@ -561,7 +260,8 @@ void TentativePFactory_kokkos:: typedef Kokkos::ArithTraits ATS; using impl_SC = typename ATS::val_type; using impl_ATS = Kokkos::ArithTraits; - const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one(); + const impl_SC zero = impl_ATS::zero(); + const impl_SC one = impl_ATS::one(); const LO INVALID = Teuchos::OrdinalTraits::invalid(); @@ -579,7 +279,7 @@ void TentativePFactory_kokkos:: bool goodMap; { SubFactoryMonitor m2(*this, "Check good map", coarseLevel); - goodMap = isGoodMap(*rowMap, *colMap); + goodMap = Utilities::MapsAreNested(*rowMap, *colMap); } // FIXME_KOKKOS: need to proofread later code for bad maps TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError, @@ -593,7 +293,10 @@ void TentativePFactory_kokkos:: // the data of the AmalgamationInfo container class // Extract information for unamalgamation - LO fullBlockSize, blockID, stridingOffset, stridedBlockSize; + LO fullBlockSize; + LO blockID; + LO stridingOffset; + LO stridedBlockSize; GO indexBase; amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase); GO globalOffset = amalgInfo->GlobalOffset(); @@ -652,7 +355,7 @@ void TentativePFactory_kokkos:: // Find maximum dof size for aggregates // Later used to reserve enough scratch space for local QR decompositions LO maxAggSize = 0; - ReduceMaxFunctor reduceMax(aggDofSizes); + LocalQR::ReduceMaxFunctor reduceMax(aggDofSizes); Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize); // parallel_scan (exclusive) @@ -860,10 +563,10 @@ void TentativePFactory_kokkos:: // Each team handles a slice of the data associated with one aggregate // and performs a local QR decomposition const Kokkos::TeamPolicy policy(numAggregates, 1); // numAggregates teams a 1 thread - LocalQRDecompFunctor + LocalQR::LocalQRDecompFunctor localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic, rows, rowsAux, colsAux, valsAux, doQRStep); Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz); @@ -1019,7 +722,7 @@ void TentativePFactory_kokkos:: // Aggregates map is based on the amalgamated column map // We can skip global-to-local conversion if LIDs in row map are // same as LIDs in column map - bool goodMap = MueLu::Utilities::MapsAreNested(*rowMap, *colMap); + bool goodMap = Utilities::MapsAreNested(*rowMap, *colMap); TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError, "MueLu: TentativePFactory_kokkos: for now works only with good maps " "(i.e. \"matching\" row and column maps)"); @@ -1031,7 +734,10 @@ void TentativePFactory_kokkos:: // the data of the AmalgamationInfo container class // Extract information for unamalgamation - LO fullBlockSize, blockID, stridingOffset, stridedBlockSize; + LO fullBlockSize; + LO blockID; + LO stridingOffset; + LO stridedBlockSize; GO indexBase; amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase); // GO globalOffset = amalgInfo->GlobalOffset(); @@ -1061,7 +767,7 @@ void TentativePFactory_kokkos:: // Find maximum dof size for aggregates // Later used to reserve enough scratch space for local QR decompositions LO maxAggSize = 0; - ReduceMaxFunctor reduceMax(aggDofSizes); + LocalQR::ReduceMaxFunctor reduceMax(aggDofSizes); Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize); // parallel_scan (exclusive) @@ -1269,29 +975,6 @@ void TentativePFactory_kokkos:: throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented"); } -template -bool TentativePFactory_kokkos:: - isGoodMap(const Map& rowMap, const Map& colMap) const { - auto rowLocalMap = rowMap.getLocalMap(); - auto colLocalMap = colMap.getLocalMap(); - - const size_t numRows = rowLocalMap.getLocalNumElements(); - const size_t numCols = colLocalMap.getLocalNumElements(); - - if (numCols < numRows) - return false; - - size_t numDiff = 0; - Kokkos::parallel_reduce( - "MueLu:TentativePF:isGoodMap", range_type(0, numRows), - KOKKOS_LAMBDA(const LO i, size_t& diff) { - diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i)); - }, - numDiff); - - return (numDiff == 0); -} - } // namespace MueLu #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT From df226ba9dc12589930ac83e168b5beed10947755 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Thu, 5 Sep 2024 17:31:05 -0600 Subject: [PATCH 3/4] MueLu TentativePFactory_kokkos: Swicth to KokkosKernels batched BLAS for QR Signed-off-by: Christian Glusa --- .../Smoothed-Aggregation/MueLu_LocalQR.hpp | 224 ++++++++++-------- .../MueLu_TentativePFactory_kokkos_def.hpp | 6 +- .../TentativePFactory_kokkos.cpp | 2 + 3 files changed, 126 insertions(+), 106 deletions(-) diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_LocalQR.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_LocalQR.hpp index 733b16dc2c9c..40fd02b9e609 100644 --- a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_LocalQR.hpp +++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_LocalQR.hpp @@ -5,6 +5,21 @@ #include #include "Xpetra_ConfigDefs.hpp" +#include "KokkosBlas1_set.hpp" +#include "KokkosBatched_QR_FormQ_TeamVector_Internal.hpp" +#include "KokkosBatched_ApplyQ_Decl.hpp" +#include "KokkosBatched_SetIdentity_Decl.hpp" +#include "KokkosBatched_SetIdentity_Impl.hpp" +#include "Kokkos_DualView.hpp" +#include "Kokkos_Pair.hpp" +#include "Kokkos_UnorderedMap.hpp" +#include "KokkosBatched_QR_Decl.hpp" +#include "KokkosBatched_QR_Serial_Impl.hpp" +#include "KokkosBatched_QR_TeamVector_Impl.hpp" +#include "KokkosBatched_QR_FormQ_TeamVector_Internal.hpp" + +// #define MUELU_DEBUG_QR 1 + namespace MueLu::LocalQR { template @@ -35,6 +50,7 @@ class ReduceMaxFunctor { View view_; }; +// local QR decomposition // local QR decomposition template class LocalQRDecompFunctor { @@ -51,6 +67,9 @@ class LocalQRDecompFunctor { using shared_matrix = Kokkos::View; using shared_vector = Kokkos::View; + using temp_view_type1 = Kokkos::View; + using temp_view_type = Kokkos::View; + NspType fineNS; NspType coarseNS; aggRowsType aggRows; @@ -62,6 +81,8 @@ class LocalQRDecompFunctor { colsAuxType colsAux; valsAuxType valsAux; bool doQRStep; + temp_view_type1 tau; + temp_view_type work; public: LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_) @@ -75,126 +96,126 @@ class LocalQRDecompFunctor { , rowsAux(rowsAux_) , colsAux(colsAux_) , valsAux(valsAux_) - , doQRStep(doQRStep_) {} + , doQRStep(doQRStep_) { + work = temp_view_type("work", aggRows_.extent(0), maxAggDofSize_); + tau = temp_view_type1("tau", fineNS.extent(0)); + } KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type& thread, size_t& nnz) const { + using member_type = typename Kokkos::TeamPolicy::member_type; + auto agg = thread.league_rank(); + const auto aggOffset = aggRows(agg); // size of aggregate: number of DOFs in aggregate - auto aggSize = aggRows(agg + 1) - aggRows(agg); + const auto aggSize = aggRows(agg + 1) - aggOffset; const impl_SC one = impl_ATS::one(); - const impl_SC two = one + one; const impl_SC zero = impl_ATS::zero(); - const auto zeroM = impl_ATS::magnitude(zero); - int m = aggSize; - int n = fineNS.extent(1); + const int m = aggSize; + const int n = fineNS.extent(1); // calculate row offset for coarse nullspace Xpetra::global_size_t offset = agg * n; if (doQRStep) { + // A is m x n + // Q is m x m + // R is m x n + + // A (initially) gets overwritten with R in QR + shared_matrix r(thread.team_shmem(), m, n); + // Q + shared_matrix q(thread.team_shmem(), m, m); + // Extract the piece of the nullspace corresponding to the aggregate - shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end) for (int j = 0; j < n; j++) for (int k = 0; k < m; k++) - r(k, j) = fineNS(agg2RowMapLO(aggRows(agg) + k), j); -#if 0 - printf("A\n"); - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) - printf(" %5.3lf ", r(i,j)); - printf("\n"); - } + r(k, j) = fineNS(agg2RowMapLO(aggOffset + k), j); + +#ifdef MUELU_DEBUG_QR + Kokkos::printf("\nA %d x %d\n", m, n); + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) + Kokkos::printf(" %5.3lf ", r(i, j)); + Kokkos::printf("\n"); + } #endif + thread.team_barrier(); - // Calculate QR decomposition (standard) - shared_matrix q(thread.team_shmem(), m, m); // Q if (m >= n) { - bool isSingular = false; + // team_tau has size n + auto team_tau = Kokkos::subview(tau, Kokkos::make_pair(aggOffset, aggOffset + n)); + KokkosBlas::TeamVectorSet::invoke(thread, zero, team_tau); - // Initialize Q^T - auto qt = q; + // team_work has size m + auto team_work = Kokkos::subview(work, agg, Kokkos::make_pair(0, m)); + KokkosBlas::TeamVectorSet::invoke(thread, zero, team_work); + + // Calculate QR + KokkosBatched::TeamVectorQR::invoke(thread, r, team_tau, team_work); + thread.team_barrier(); + +#ifdef MUELU_DEBUG_QR + Kokkos::printf("\nQR %d x %d\n", m, n); for (int i = 0; i < m; i++) { - for (int j = 0; j < m; j++) - qt(i, j) = zero; - qt(i, i) = one; + for (int j = 0; j < n; j++) + Kokkos::printf(" %5.3lf ", r(i, j)); + Kokkos::printf("\n"); } +#endif - for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize - // FIXME_KOKKOS: use team - Magnitude s = zeroM; - Magnitude norm; - Magnitude norm_x; - for (int i = k + 1; i < m; i++) - s += pow(impl_ATS::magnitude(r(i, k)), 2); - norm = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s); - - if (norm == zero) { - isSingular = true; - break; - } + // Initialize Q to identity + KokkosBatched::TeamSetIdentity::invoke(thread, q); + thread.team_barrier(); - r(k, k) -= norm * one; + // Form Q + KokkosBatched::TeamVectorApplyQ::invoke(thread, r, team_tau, q, team_work); + thread.team_barrier(); - norm_x = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s); - if (norm_x == zeroM) { - // We have a single diagonal element in the column. - // No reflections required. Just need to restor r(k,k). - r(k, k) = norm * one; - continue; - } +#ifdef MUELU_DEBUG_QR + Kokkos::printf("\nQ1 %d x %d\n", m, m); + for (int i = 0; i < m; i++) { + for (int j = 0; j < m; j++) + Kokkos::printf(" %5.3lf ", q(i, j)); + Kokkos::printf("\n"); + } - // FIXME_KOKKOS: use team - for (int i = k; i < m; i++) - r(i, k) /= norm_x; - - // Update R(k:m,k+1:n) - for (int j = k + 1; j < n; j++) { - // FIXME_KOKKOS: use team in the loops - impl_SC si = zero; - for (int i = k; i < m; i++) - si += r(i, k) * r(i, j); - for (int i = k; i < m; i++) - r(i, j) -= two * si * r(i, k); - } + // Initialize Q to identity + KokkosBatched::TeamSetIdentity::invoke(thread, q); + thread.team_barrier(); - // Update Q^T (k:m,k:m) - for (int j = k; j < m; j++) { - // FIXME_KOKKOS: use team in the loops - impl_SC si = zero; - for (int i = k; i < m; i++) - si += r(i, k) * qt(i, j); - for (int i = k; i < m; i++) - qt(i, j) -= two * si * r(i, k); - } + for (int k = 0; k < m; ++k) { + auto q_col = Kokkos::subview(q, Kokkos::ALL(), k); + KokkosBatched::TeamVectorApplyQ::invoke(thread, r, team_tau, q_col, team_work); + } + + Kokkos::printf("\nQ2 %d x %d\n", m, m); + for (int i = 0; i < m; i++) { + for (int j = 0; j < m; j++) + Kokkos::printf(" %5.3lf ", q(i, j)); + Kokkos::printf("\n"); + } - // Fix R(k:m,k) - r(k, k) = norm * one; - for (int i = k + 1; i < m; i++) - r(i, k) = zero; + Kokkos::printf("\ntau %d \n", n); + for (int i = 0; i < n; i++) { + Kokkos::printf(" %5.3lf ", team_tau(i)); + Kokkos::printf("\n"); } -#if 0 - // Q = (Q^T)^T - for (int i = 0; i < m; i++) - for (int j = 0; j < i; j++) { - impl_SC tmp = qt(i,j); - qt(i,j) = qt(j,i); - qt(j,i) = tmp; - } + Kokkos::printf("\work %d \n", team_work.extent(0)); + for (int i = 0; i < team_work.extent(0); i++) { + Kokkos::printf(" %5.3lf ", team_work(i)); + Kokkos::printf("\n"); + } #endif // Build coarse nullspace using the upper triangular part of R - for (int j = 0; j < n; j++) - for (int k = 0; k <= j; k++) - coarseNS(offset + k, j) = r(k, j); - - if (isSingular) { - statusAtomic(1) = true; - return; + for (int j = 0; j < n; j++) { + for (int k = 0; k < n; k++) + coarseNS(offset + k, j) = (k <= j) ? r(k, j) : zero; } } else { @@ -241,6 +262,23 @@ class LocalQRDecompFunctor { q(i, j) = (j == i ? one : zero); } +#ifdef MUELU_DEBUG_QR + // Kokkos::printf("\nQ3 %d x %d\n", m, m); + // for (int i = 0; i < m; i++) { + // for (int j = 0; j < m; j++) + // Kokkos::printf(" %5.3lf ", q(i, j)); + // Kokkos::printf("\n"); + // } + + Kokkos::printf("\nR %d x %d\n", m, n); + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) + Kokkos::printf(" %5.3lf ", i <= j ? r(i, j) : zero); + Kokkos::printf("\n"); + } + +#endif + // Process each row in the local Q factor and fill helper arrays to assemble P for (int j = 0; j < m; j++) { LO localRow = agg2RowMapLO(aggRows(agg) + j); @@ -257,22 +295,6 @@ class LocalQRDecompFunctor { rows(localRow + 1) = lnnz; nnz += lnnz; } - -#if 0 - printf("R\n"); - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) - printf(" %5.3lf ", coarseNS(i,j)); - printf("\n"); - } - - printf("Q\n"); - for (int i = 0; i < aggSize; i++) { - for (int j = 0; j < aggSize; j++) - printf(" %5.3lf ", q(i,j)); - printf("\n"); - } -#endif } else { ///////////////////////////// // "no-QR" option // diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp index 32c6c9487004..8531d6d3bee2 100644 --- a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp +++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp @@ -440,11 +440,7 @@ void TentativePFactory_kokkos:: KOKKOS_LAMBDA(const LO row) { rowsAux(row) = row * NSDim; }); - Kokkos::parallel_for( - "MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate), - KOKKOS_LAMBDA(const LO j) { - colsAux(j) = INVALID; - }); + Kokkos::deep_copy(colsAux, INVALID); } if (NSDim == 1) { diff --git a/packages/muelu/test/unit_tests_kokkos/TentativePFactory_kokkos.cpp b/packages/muelu/test/unit_tests_kokkos/TentativePFactory_kokkos.cpp index 2864e357ac6d..1e52d7c71e47 100644 --- a/packages/muelu/test/unit_tests_kokkos/TentativePFactory_kokkos.cpp +++ b/packages/muelu/test/unit_tests_kokkos/TentativePFactory_kokkos.cpp @@ -286,6 +286,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(TentativePFactory_kokkos, MakeTentativeVectorB // coarseNullspace->describe(out, Teuchos::VERB_EXTREME); + // Ptent->describe(out, Teuchos::VERB_EXTREME); + // Check interpolation by computing ||fineNS - P*coarseNS|| auto PtN = MultiVectorFactory::Build(Ptent->getRangeMap(), NSdim); Ptent->apply(*coarseNullspace, *PtN, Teuchos::NO_TRANS, 1.0, 0.0); From 060a9977bac38547eb5a9e17ee808307fde0c219 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Wed, 18 Sep 2024 15:40:30 -0600 Subject: [PATCH 4/4] Kokkos Kernels: Fix for batched QR Signed-off-by: Christian Glusa --- .../batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp | 2 +- .../batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp index 95ca1c4340d1..4988b1ed60e4 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp @@ -53,7 +53,7 @@ struct SerialQR_Internal { A_part2x2.partWithATL(A, m, n, 0, 0); t_part2x1.partWithAT(t, m, 0); - for (int m_atl = 0; m_atl < m; ++m_atl) { + for (int m_atl = 0; m_atl < Kokkos::min(m, n); ++m_atl) { // part 2x2 into 3x3 A_part3x3.partWithABR(A_part2x2, 1, 1); const int m_A22 = m - m_atl - 1; diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp index e3dde679865a..c6d97ee6f7b2 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp @@ -54,7 +54,7 @@ struct TeamVectorQR_Internal { A_part2x2.partWithATL(A, m, n, 0, 0); t_part2x1.partWithAT(t, m, 0); - for (int m_atl = 0; m_atl < m; ++m_atl) { + for (int m_atl = 0; m_atl < Kokkos::min(m, n); ++m_atl) { // part 2x2 into 3x3 A_part3x3.partWithABR(A_part2x2, 1, 1); const int m_A22 = m - m_atl - 1;