diff --git a/Eigen/SparseCholesky b/Eigen/SparseCholesky index 6abdcd6601cc85d15fd20419bd28af2391907d0b..56bd0252c07caa2f42382a8c8b075440c9f2515f 100644 --- a/Eigen/SparseCholesky +++ b/Eigen/SparseCholesky @@ -33,6 +33,8 @@ // IWYU pragma: begin_exports #include "src/SparseCholesky/SimplicialCholesky.h" #include "src/SparseCholesky/SimplicialCholesky_impl.h" +#include "src/SparseCholesky/SupernodalCholesky.h" +#include "src/SparseCholesky/SupernodalCholesky_impl.h" // IWYU pragma: end_exports #include "src/Core/util/ReenableStupidWarnings.h" diff --git a/Eigen/src/SparseCholesky/SupernodalCholesky.h b/Eigen/src/SparseCholesky/SupernodalCholesky.h new file mode 100644 index 0000000000000000000000000000000000000000..a12e168efb708a3217c107ddbcac8a81ead0f779 --- /dev/null +++ b/Eigen/src/SparseCholesky/SupernodalCholesky.h @@ -0,0 +1,166 @@ +#ifndef EIGEN_SUPERNODAL_CHOLESKY_H +#define EIGEN_SUPERNODAL_CHOLESKY_H + +namespace Eigen { + +template +class SupernodalCholeskyLLT : public SparseSolverBase> { + using Self = SupernodalCholeskyLLT; + using Base = SparseSolverBase; + using Base::m_isInitialized; + + public: + using MatrixType = MatrixType_; + enum { UpLo = UpLo_ }; + using Scalar = typename MatrixType::Scalar; + using RealScalar = typename MatrixType::RealScalar; + using StorageIndex = typename MatrixType::StorageIndex; + + using Helper = Eigen::internal::simpl_chol_helper; + using HelperSN = Eigen::internal::supernodal_chol_helper; + + using VectorType = Matrix; + using VectorI = Matrix; + + using MatrixL = SparseMatrix; + using MatrixU = SparseMatrix; + + using CholMatrixType = SparseMatrix; + + SupernodalCholeskyLLT() = default; + + SupernodalCholeskyLLT& setShift(Scalar beta) { + m_beta = beta; + return *this; + } + + SupernodalCholeskyLLT& analyzePattern(const MatrixType& A) { + clear_(); + m_info = Eigen::Success; + + MatrixType Au = A.template triangularView(); + Au.makeCompressed(); + + const StorageIndex n = static_cast(Au.cols()); + m_n = n; + + VectorI hadjOuter(n + 1); + hadjOuter.setZero(); + VectorI tmp(n); + Helper::calc_hadj_outer(n, A, hadjOuter.data()); + VectorI hadjInner(hadjOuter[n]); + Helper::calc_hadj_inner(n, A, hadjOuter.data(), hadjInner.data(), tmp.data()); + + m_parent.resize(n); + Helper::calc_etree(n, A, m_parent.data(), tmp.data()); + + VectorI firstChild(n), firstSibling(n), post(n), dfs(n); + Helper::calc_lineage(n, m_parent.data(), firstChild.data(), firstSibling.data()); + Helper::calc_post(n, m_parent.data(), firstChild.data(), firstSibling.data(), post.data(), dfs.data()); + + VectorI prevLeaf(n); + prevLeaf.setConstant(Helper::kEmpty); + m_colcount.resize(n); + Helper::calc_colcount(n, hadjOuter.data(), hadjInner.data(), m_parent.data(), prevLeaf.data(), tmp.data(), + post.data(), m_colcount.data(), false); + + m_supe = HelperSN::compute_supernodes(m_parent, m_colcount); + m_s_parent = HelperSN::compute_supernodal_etree(m_parent, m_supe); + + HelperSN::allocate_L(m_supe, m_Lpx, m_Li, m_Lpi, m_ssize, m_xsize); + + VectorI ApVec(n + 1); + ApVec = Eigen::Map(Au.outerIndexPtr(), n + 1); + VectorI AiVec(Au.nonZeros()); + AiVec = Eigen::Map(Au.innerIndexPtr(), Au.nonZeros()); + + HelperSN::build_sn_pattern(m_supe, m_s_parent, ApVec, AiVec, nullptr, m_Li, m_Lpi); + + m_Lx.resize(m_Lpx[m_supe.n_supernodes]); + m_symbolic_ok = true; + return *this; + } + + SupernodalCholeskyLLT& factorize(const MatrixType& A) { + m_numeric_ok = false; + m_info = Eigen::Success; + + if (!m_symbolic_ok) { + m_info = Eigen::InvalidInput; + return *this; + } + + MatrixType Al = A.template triangularView(); + Al.makeCompressed(); + + bool ok = HelperSN::compute_numeric(Al, m_supe, m_Lpi, m_Lpx, m_Li, m_beta, m_Lx); + if (!ok) { + m_info = Eigen::NumericalIssue; + return *this; + } + + m_L = HelperSN::export_sparse(m_supe, m_Lpi, m_Lpx, m_Li, m_Lx, m_n); + + // todo: we could resize/free everything here except m_L + + m_numeric_ok = true; + return *this; + } + + SupernodalCholeskyLLT& compute(const MatrixType& A) { return analyzePattern(A).factorize(A); } + + const MatrixL& matrixL() const { return m_L; } + ComputationInfo info() const { return m_info; } + + private: + void clear_() { + m_L.resize(0, 0); + m_parent.resize(0); + m_colcount.resize(0); + m_supe.supernodes.resize(0); + m_supe.sn_id.resize(0); + m_supe.Snz.resize(0); + m_s_parent.resize(0); + m_Lpi.resize(0); + m_Lpx.resize(0); + m_Li.resize(0); + m_Lx.resize(0); + m_symbolic_ok = false; + m_numeric_ok = false; + m_n = 0; + m_ssize = m_xsize = 0; + } + + MatrixL m_L; + VectorI m_parent; + VectorI m_colcount; + + HelperSN::Supernodes m_supe; + + VectorI m_s_parent; + VectorI m_Lpi, m_Lpx, m_Li; + VectorType m_Lx; + StorageIndex m_n = 0, m_ssize = 0, m_xsize = 0; + Scalar m_beta = Scalar(0); + bool m_symbolic_ok = false, m_numeric_ok = false; + ComputationInfo m_info = Success; +}; + +namespace internal { + +template +struct traits> { + using MatrixType = _MatrixType; + using Scalar = typename MatrixType::Scalar; + using StorageIndex = typename MatrixType::StorageIndex; + enum { UpLo = _UpLo }; + using OrderingType = _Ordering; + + using MatrixL = SparseMatrix; + using MatrixU = SparseMatrix; +}; + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_SUPERNODAL_CHOLESKY_H \ No newline at end of file diff --git a/Eigen/src/SparseCholesky/SupernodalCholesky_impl.h b/Eigen/src/SparseCholesky/SupernodalCholesky_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..b2c140b80a8a2293bb7de84dad280e7f65bebcbe --- /dev/null +++ b/Eigen/src/SparseCholesky/SupernodalCholesky_impl.h @@ -0,0 +1,845 @@ +#ifndef EIGEN_SUPERNODAL_CHOLESKY_IMPL_H +#define EIGEN_SUPERNODAL_CHOLESKY_IMPL_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +template +struct supernodal_chol_helper { + using CholMatrixType = SparseMatrix; + using InnerIterator = typename CholMatrixType::InnerIterator; + using VectorI = Matrix; + + /// means the values is empty. + /// in the etrees this means the column is a root + static constexpr StorageIndex kEmpty = -1; + + struct Supernodes { + /// boundaries of each supernode (`size = #supernodes + 1`) + VectorI supernodes; + + /// id of the final supernode each column belongs to + /// (`size = #columns`) + VectorI sn_id; + + /// #nonzero rows in each supernode. + VectorI Snz; + + /// #supernodes + StorageIndex n_supernodes; + }; + + // Supernode Algorithm Adapted from CHOLMOD + // Y. Chen, T. A. Davis, W. W. Hager, and S. Rajamanickam, + // Algorithm 887: + // CHOLMOD, supernodal sparse Cholesky factorization and update/downdate, + // ACM Trans. on Mathematical Software, 35(3), 2008, pp. 22:1--22:14. + // https://dl.acm.org/doi/abs/10.1145/1391989.1391995 + + // T. A. Davis and W. W. Hager, + // Dynamic supernodes in sparse Cholesky update/downdate and triangular solves, + // ACM Trans. on Mathematical Software, 35(4), 2009, pp. 27:1--27:23. + // https://doi.org/10.1145/1462173.1462176 + + struct RelaxParams { + StorageIndex nrelax[3] = {4, 16, 48}; + double zrelax[3] = {0.8, 0.1, 0.05}; + }; + + static bool merge_supernodes(const StorageIndex s, const StorageIndex ns, const VectorI& Snz, + const StorageIndex& nscol0, const StorageIndex& nscol1, StorageIndex& totzeros) { + RelaxParams params; + + // Always merge if the resulting supernode is small enough + if (ns <= params.nrelax[0]) return true; + + // Estimate additional nonzeros if we append `s` on the left + /// leading column nonzeros in the left supernode + const double l_nnz_0 = static_cast(Snz[s]); + + /// leading column nonzeros in the right supernode + const double l_nnz_1 = static_cast(Snz[s + 1]); + + /// the estimated number of new zero entries introduced if you force the left block + // to share the same structure as the right block in a merged supernode. + const double xnewzeros = static_cast(params.nrelax[0]) * (l_nnz_1 + nscol0 - l_nnz_0); + + const StorageIndex newzeros_ll = 1LL * nscol0 * (Snz[s + 1] + nscol0 - Snz[s]); + + if (xnewzeros == 0.0) { + // no new nonzero rows so we merge + return true; + } + /// total artificial zeros after the merge. + const double xtotzeros = static_cast(totzeros) + xnewzeros; + + /// merged width + const double xns = static_cast(ns); + + const double xtotsize = (xns * (xns + 1.0) / 2.0) + xns * (l_nnz_1 - nscol1); + + /// fill ratio after the merge; how many artificial nonzeros pop up from the merge. + /// larger z means more sparse + const double z = xtotzeros / std::max(1.0, xtotsize); + + const double int_guard = static_cast(std::numeric_limits::max()) / sizeof(double); + + totzeros += static_cast(newzeros_ll); + + if (xtotsize < int_guard && ((ns <= params.nrelax[1] && z < params.zrelax[0]) || + (ns <= params.nrelax[2] && z < params.zrelax[1]) || (z < params.zrelax[2]))) { + return true; + } + + return false; + } + + static Supernodes compute_supernodes(const VectorI& parent, const VectorI& colcount) { + const StorageIndex n = parent.size(); + eigen_assert(colcount.size() == n && "colcount must have size n"); + + if (n == 0) { + return Supernodes{VectorI::Zero(1), VectorI::Zero(0), VectorI::Zero(0)}; + } + + /// id of the final supernode each column belongs to + VectorI sn_id = VectorI::Constant(n, kEmpty); + + VectorI supernodes; + + // Fundamental supernodes + + /// #children of each node + VectorI childCount = VectorI::Zero(n); + + for (StorageIndex j = 0; j < n; ++j) + // make sure its not a root node + if (parent[j] != kEmpty) childCount[parent[j]]++; + + // fundamental supernode test: + // (1) parent[j] == j + 1 (j+1 is parent of j in etree) + // (2) nnz(L(:,j)) - nnz(L(:,j-1)) == 1 + // (they have the same # of nonzeros below the diagonal + // minus 1 for the diagonal) + // (3) j has at most one child (to ensure a chain of nodes) + // once these three conditions are met then we know they have the same + // sparsity structure below the diagonal. Since the principle of column + // replication ensures that the sparsity pattern of column j is replicated + // into column j + 1 if j + 1 is the parent of j + + /// fundamental supernodes boundaries + VectorI fn_supernodes; + + fn_supernodes.resize(n + 1); + StorageIndex m = 0; + + fn_supernodes[m++] = 0; + for (StorageIndex j = 1; j < n; ++j) { + const bool parentChain = (parent[j - 1] == j); + const bool colNest = (colcount[j - 1] == colcount[j] + 1); + const bool singleChild = (childCount[j] <= 1); + if (!(parentChain && colNest && singleChild)) { + // j is the leading node of a supernode + fn_supernodes[m++] = j; + } + } + fn_supernodes[m++] = n; + fn_supernodes.conservativeResize(m); + + // TODO: we no longer need childCount so I should think about whether I want to free it? + // childCount = VectorI(); + + /// #fundamental supernodes + const StorageIndex nfsuper = static_cast(fn_supernodes.size()) - 1; + + /// id of the fundamental supernode each column belongs to + VectorI fundamental_sn_id(n); + + for (StorageIndex s = 0; s < nfsuper; ++s) + for (StorageIndex k = fn_supernodes[s]; k < fn_supernodes[s + 1]; ++k) fundamental_sn_id[k] = s; + + /// Fundamental supernodal etree (ie: assembly tree) + VectorI f_sn_etree = VectorI::Constant(nfsuper, kEmpty); + + for (int s = 0; s < nfsuper; ++s) { + const int jlast = fn_supernodes[s + 1] - 1; + const int pcol = parent[jlast]; + f_sn_etree[s] = (pcol == kEmpty) ? kEmpty : fundamental_sn_id[pcol]; + } + + // Relaxed amalgamation + // Here we want to merge neighboring fundamental supernodes + // while ensuring that the resulting supernode is not too sparse. + + // Default parameters from CHOLMOD + // https://github.com/DrTimothyAldenDavis/SuiteSparse/blob/d558c83006d63d1dc62004f30042b3ca484f3f94/CHOLMOD/Utility/t_cholmod_defaults.c#L54-L59 + constexpr StorageIndex nrelax0 = 4; + constexpr StorageIndex nrelax1 = 16; + constexpr StorageIndex nrelax2 = 48; + constexpr double zrelax0 = 0.8; + constexpr double zrelax1 = 0.1; + constexpr double zrelax2 = 0.05; + + /// #columns in each fundamental supernode + VectorI Nscol(nfsuper); + + /// #nonzero rows in each fundamental supernode. + /// at the start this is just the nnz of the leading column + VectorI Snz(nfsuper); + + /// accumulated new zeros introduced by earlier merges (starts at 0). + VectorI Zeros = VectorI::Zero(nfsuper); + + VectorI Merged = VectorI::Constant(nfsuper, kEmpty); + + for (StorageIndex s = 0; s < nfsuper; ++s) { + Nscol[s] = fn_supernodes[s + 1] - fn_supernodes[s]; // width + eigen_assert(s <= fn_supernodes[s] && "supernode index is invalid"); + Snz[s] = colcount[fn_supernodes[s]]; // nnz of leading col + } + + for (StorageIndex s = nfsuper - 2; s >= 0; --s) { + StorageIndex ss = f_sn_etree[s]; + + if (ss == kEmpty) continue; + + while (Merged[ss] != kEmpty) ss = Merged[ss]; + + const StorageIndex sparent = ss; + + for (ss = f_sn_etree[s]; ss != kEmpty && Merged[ss] != kEmpty;) { + const StorageIndex snext = Merged[ss]; + Merged[ss] = sparent; + ss = snext; + } + + if (sparent != s + 1) continue; + + /// width of the left supernode + const StorageIndex nscol0 = Nscol[s]; + + // width of the right supernode + const StorageIndex nscol1 = Nscol[s + 1]; + + const StorageIndex ns = nscol0 + nscol1; + + /// zeros accumulated in the right block + StorageIndex totzeros = Zeros[s + 1]; + + /// leading column nonzeros in the right supernode + const double l_nnz_1 = static_cast(Snz[s + 1]); + + const bool merge = merge_supernodes(s, ns, Snz, nscol0, nscol1, totzeros); + + if (merge) { + Zeros[s] = totzeros; + Merged[s + 1] = s; + Snz[s] = nscol0 + Snz[s + 1]; + Nscol[s] += Nscol[s + 1]; + } + } + + // Emit relaxed supernodes + + /// Count how many supernodes + StorageIndex n_supernodes = 1; + + for (StorageIndex s = 0; s < nfsuper; ++s) + if (Merged[s] == kEmpty) n_supernodes++; + + VectorI relaxed(n_supernodes); + + StorageIndex r = 0; + for (StorageIndex s = 0; s < nfsuper; ++s) + if (Merged[s] == kEmpty) relaxed[r++] = fn_supernodes[s]; + + relaxed[r++] = n; + + supernodes.resize(relaxed.size()); + for (StorageIndex i = 0; i < supernodes.size(); ++i) supernodes[i] = relaxed[static_cast(i)]; + + for (StorageIndex s = 0; s + 1 < relaxed.size(); ++s) + for (StorageIndex k = relaxed[s]; k < relaxed[s + 1]; ++k) sn_id[k] = s; + + Supernodes out; + out.supernodes = std::move(supernodes); + out.sn_id = std::move(sn_id); + out.n_supernodes = out.supernodes.size() - 1; + + // Build compact Snz + VectorI Snz_relaxed(relaxed.size() - 1); + StorageIndex t = 0; + for (StorageIndex s = 0; s < nfsuper; ++s) { + if (Merged[s] == kEmpty) { + Snz_relaxed[t++] = Snz[s]; + } + } + out.Snz = std::move(Snz_relaxed); + return out; + } + + static VectorI compute_supernodal_etree(const VectorI& parent, const Supernodes& supe) { + VectorI s_parent = VectorI::Constant(supe.n_supernodes, kEmpty); + + for (StorageIndex s = 0; s < supe.n_supernodes; ++s) { + const StorageIndex jlast = supe.supernodes[s + 1] - 1; + const StorageIndex pcol = parent[jlast]; + s_parent[s] = (pcol < 0) ? kEmpty : supe.sn_id[pcol]; + } + return s_parent; + } + + /// Lp (size `# supernodes + 1`) + /// Li List of row indices used by a super node `s` + /// Li[Lp[s] ... Lp[s+1]-1] + + /** + * @brief finds which supernodes column `j` contributes to in the factor `L` + * + * @param j the column we are building the pattern for + * @param Ap column pointers of A + * @param Ai row indices of A + * @param Anz + * @param sn_id column to supernode mapping + * @param s_parent supernodal elimination tree + * @param mark + * @param k1 supernode start column (k1 <= j) + * @param flag + * @param Li List of row indices used by supernodes + * @param Lp supernode column pointers (size `# supernodes + 1`) + */ + static void sn_contribution(StorageIndex j, const VectorI& Ap, const VectorI& Ai, const VectorI* Anz, + const VectorI& sn_id, const VectorI& s_parent, StorageIndex mark, StorageIndex k1, + VectorI& flag, VectorI& Li, VectorI& Lp) { + StorageIndex p = Ap[j]; + const StorageIndex pend = (Anz == nullptr) ? Ap[j + 1] : (p + (*Anz)[j]); + + for (; p < pend; ++p) { + const StorageIndex i = Ai[p]; + + if (i < k1) { + // climb up supernodal etree + for (StorageIndex si = sn_id[i]; flag[si] < mark; si = s_parent[si]) { + Li[Lp[si]++] = j; + flag[si] = mark; + } + } else { + break; + } + } + } + + /** + * @brief computes sizes of `Li` and `Lx` + * + * @param supe supernode structure + * @param[out] ssize output size of Li + * @param[out] xsize output size of Lx + */ + static void compute_supernodal_sizes(const Supernodes& supe, StorageIndex& ssize, StorageIndex& xsize) { + const StorageIndex n_supernodes = supe.n_supernodes; + + ssize = 0; + xsize = 0; + + for (StorageIndex s = 0; s < supe.n_supernodes; ++s) { + /// number of columns in this supernode + uint64_t nscol = static_cast(supe.supernodes[s + 1] - supe.supernodes[s]); + + /// number of rows (including diagonal block) + uint64_t nsrow = static_cast(supe.Snz[s]); + + // add to ssize (row index pool length) + ssize += static_cast(nsrow); + + /// size of supernodal block + uint64_t c = nscol * nsrow; + + xsize += static_cast(c); + } + + // ensure results fit w/i limits + eigen_assert(ssize >= 0 && ssize <= std::numeric_limits::max() && "Li size too large"); + eigen_assert(xsize >= 0 && xsize <= std::numeric_limits::max() && "Lx size too large"); + } + + // Lx = data (dense blocks). + // Lpx = per-supernode pointers into that data (size n_supernodes+1). + // Li = pattern (row indices). + // Lpi = per-supernode pointers into that pattern (size n_supernodes+1). + + /** + * @brief allocate `L` + * + * @param supe supernode structure + * @param[out] Lpx per-supernode pointers into that data (size n_supernodes+1) + * @param[out] Li pattern (row indices) (size `nsrows[s]` for supernode `s`) + * @param[out] Lpi per-supernode pointers into that pattern (size n_supernodes+1) + * @param[out] ssize total size of `Li` + * @param[out] xsize total size of `Lx` + */ + static void allocate_L(const Supernodes& supe, VectorI& Lpx, VectorI& Li, VectorI& Lpi, StorageIndex& ssize, + StorageIndex& xsize) { + // sizes + compute_supernodal_sizes(supe, ssize, xsize); + + Lpi.resize(supe.n_supernodes + 1); + Lpx.resize(supe.n_supernodes + 1); + Li.resize(ssize); + + // fill column pointers (Lpi) + { + StorageIndex p = 0; + for (StorageIndex s = 0; s < supe.n_supernodes; ++s) { + Lpi[s] = p; + p += supe.Snz[s]; + } + Lpi[supe.n_supernodes] = p; + eigen_assert(p == ssize); + } + + // fill block value pointers (Lpx) + { + StorageIndex q = 0; + for (StorageIndex s = 0; s < supe.n_supernodes; ++s) { + const StorageIndex nscol = supe.supernodes[s + 1] - supe.supernodes[s]; + const StorageIndex nsrow = supe.Snz[s]; + Lpx[s] = q; + q += nscol * nsrow; + } + Lpx[supe.n_supernodes] = q; + eigen_assert(q == xsize); + } + } + + /** + * @brief Fill the supernodal pattern `Li`. + * + * @param supe supernode structure + * @param s_parent supernodal etree (size n_supernodes) + * @param Ap, Ai CSC of A + * @param Anz per-column counts + * @param Li row indices pool (allocated) + * @param Lpi per-supernode pointers into Li + */ + static void build_sn_pattern(const Supernodes& supe, const VectorI& s_parent, const VectorI& Ap, const VectorI& Ai, + const VectorI* Anz, VectorI& Li, const VectorI& Lpi) { + const StorageIndex n_supernodes = supe.n_supernodes; + + /// working copy of Lpi + VectorI Lpi2 = Lpi; + + /// per-supernode flag array + VectorI flag = VectorI::Zero(supe.n_supernodes); + StorageIndex mark = 1; + + auto clear_flag = [&]() { + if (mark == std::numeric_limits::max()) { + flag.setZero(); + mark = 1; + } else { + ++mark; + } + }; + + for (StorageIndex s = 0; s < n_supernodes; ++s) { + const StorageIndex k1 = supe.supernodes[s]; + const StorageIndex k2 = supe.supernodes[s + 1]; + + // put rows `k1...k2-1` in leading column of supernode `s` + for (StorageIndex k = k1; k < k2; ++k) { + Li[Lpi2[s]++] = k; + } + + // traverse for each column k in this supernode + for (StorageIndex k = k1; k < k2; ++k) { + clear_flag(); + flag[s] = mark; // mark this supernode + sn_contribution(k, Ap, Ai, Anz, supe.sn_id, s_parent, mark, k1, flag, Li, Lpi2); + } + } + +#ifndef EIGEN_NO_DEBUG + for (StorageIndex s = 0; s < supe.n_supernodes; ++s) { + eigen_assert(Lpi2[s] == Lpi[s + 1] && "Li fill mismatch"); + } +#endif + } + + /** + * @brief Compute largest update matrix + * + * @param supe + * @param Lpi + * @param Li + * @param[out] maxcsize largest panel area among all updates + * @param[out] maxesize largest esize = (# extra rows beyond diagonal block) for any supernode + */ + static void compute_max_update(const Supernodes& supe, const VectorI& Lpi, const VectorI& Li, StorageIndex& maxcsize, + StorageIndex& maxesize) { + const StorageIndex n_supernodes = supe.n_supernodes; + eigen_assert(supe.Snz.size() == n_supernodes); + eigen_assert(Lpi.size() == n_supernodes + 1); + + maxcsize = StorageIndex(1); + maxesize = StorageIndex(1); + + for (StorageIndex d = 0; d < n_supernodes; ++d) { + /// number of supernode columns + const StorageIndex nscol = supe.supernodes[d + 1] - supe.supernodes[d]; + + /// Start of extra rows (after diagonal block) + StorageIndex p = Lpi[d] + nscol; + + StorageIndex plast = p; + + const StorageIndex pend = Lpi[d + 1]; + + /// number of extra rows used in solves for this supernode + const StorageIndex esize = pend - p; + + if (esize > maxesize) maxesize = esize; + + // If no extra rows skip + if (p == pend) continue; + + // Current group label = supernode id of the first extra-row index + StorageIndex slast = supe.sn_id[Li[p]]; + + // Walk until pend + for (; p <= pend; ++p) { + const StorageIndex s = (p == pend) ? kEmpty : supe.sn_id[Li[p]]; + + if (s != slast) { + /// number of rows in this group + const StorageIndex ndrow1 = p - plast; + + /// number of rows in the trailing panel + const StorageIndex ndrow2 = pend - plast; + + const uint64_t c = uint64_t(ndrow1) * uint64_t(ndrow2); + const uint64_t cap = uint64_t(std::numeric_limits::max()); + + const StorageIndex csize = StorageIndex(c > cap ? cap : c); + + if (csize > maxcsize) maxcsize = csize; + + plast = p; + slast = s; + } + } + } + } + + /** + * @brief + * + * @param A + * @param supe supernode struct + * @param Lpi pointers into Li (size n_supernodes+1) + * @param Lpx pointers into Lx (size n_supernodes+1) + * @param Li row indices per supernode (size = Lpi[n_supernodes]) + * @param beta + * @param[out] Lx dense storage for all supernodes (size = Lpx[n_supernodes]) + */ + static bool compute_numeric(const CholMatrixType& A, const Supernodes& supe, const VectorI& Lpi, const VectorI& Lpx, + const VectorI& Li, Scalar beta, Matrix& Lx) { + using std::max; + + const VectorI& supernodes = supe.supernodes; + const VectorI& Snz = supe.Snz; + const VectorI& sn_id = supe.sn_id; + + const StorageIndex n = static_cast(A.cols()); + const StorageIndex n_supernodes = static_cast(supernodes.size()) - 1; + eigen_assert(static_cast(A.rows()) == n); + + // allocate numeric storage + Lx.resize(Lpx[n_supernodes]); + Lx.setZero(); + + /// Map[i] -> local row within current supernode s (or -1) + VectorI Map(n); + Map.setConstant(kEmpty); + + /// head of child list for each supernode + VectorI head(n_supernodes); + head.setConstant(kEmpty); + + /// sibling link for child list + VectorI next(n_supernodes); + next.setConstant(kEmpty); + + /// where child d resumes for next ancestor + VectorI Lpos(n_supernodes); + Lpos.setZero(); + + // figure out a safe scratch size for C and for RelativeMap + StorageIndex maxcsize = 1, maxesize = 1; + compute_max_update(supe, Lpi, Li, maxcsize, maxesize); + + std::vector Cbuf(static_cast(maxcsize)); + + // strided map type (outer stride = leading dimension) + using DenseMat = Matrix; + using StrideType = Eigen::Stride; + using BlockMap = Eigen::Map; + + // main supernode loop + for (StorageIndex s = 0; s < n_supernodes; ++s) { + const StorageIndex k1 = supernodes[s]; + const StorageIndex k2 = supernodes[s + 1]; + + /// width of supernode + const StorageIndex nscol = k2 - k1; + + /// pointer to first row of s in Li + const StorageIndex psi = Lpi[s]; + + /// pointer to first row of s in Lx + const StorageIndex psx = Lpx[s]; + + /// #nonzero rows in this s + const StorageIndex nsrow = Snz[s]; + + // build Map for the rows of this supernode + for (StorageIndex k = 0; k < nsrow; ++k) Map[Li[psi + k]] = k; + + // copy A(:, k1:k2-1) into dense block (lower part only) + for (StorageIndex k = k1; k < k2; ++k) { + for (typename CholMatrixType::InnerIterator it(A, k); it; ++it) { + const StorageIndex i = static_cast(it.row()); + + // keep lower triangle including diag + if (i < k) continue; + + const StorageIndex imap = Map[i]; + + if (imap >= 0 && imap < nsrow) { + Lx[psx + (k - k1) * nsrow + imap] += it.value(); + } + } + } + + // add beta to the diagonal of the supernode (top-left nscol-by-nscol block) + if (beta != Scalar(0)) { + StorageIndex pk = psx; + for (StorageIndex j = 0; j < nscol; ++j) { + Lx[pk] += beta; + pk += nsrow + 1; + } + } + + // Apply updates from children already factorized and queued at head[s] + StorageIndex dnext = head[s]; + while (dnext != kEmpty) { + const StorageIndex d = dnext; + + // preserve next pointer + dnext = next[d]; + + /// width of child + const StorageIndex ndcol = supernodes[d + 1] - supernodes[d]; + + const StorageIndex pdi = Lpi[d]; + const StorageIndex pdend = Lpi[d + 1]; + const StorageIndex pdx = Lpx[d]; + + /// total rows in child + const StorageIndex ndrow = pdend - pdi; + + const StorageIndex p = Lpos[d]; + + /// first relevant row (global index Li[pdi1]) + StorageIndex pdi1 = pdi + p; + StorageIndex pdi2 = pdi1; + while (pdi2 < pdend && Li[pdi2] < k2) ++pdi2; + + /// rows within [k1..k2) + const StorageIndex ndrow1 = pdi2 - pdi1; + + /// rows in [k1..n) + const StorageIndex ndrow2 = pdend - pdi1; + + /// rows strictly below k2 + const StorageIndex ndrow3 = ndrow2 - ndrow1; + + if (ndrow1 <= 0) continue; + + /// Map the child's dense block Ld (size ndrow by ndcol, leading dim = ndrow) + BlockMap Ld(Lx.data() + pdx, ndrow, ndcol, StrideType(ndrow, 1)); + + // Subblocks relative to p + /// rows in [k1..k2) + auto L1 = Ld.block(p, 0, ndrow1, ndcol); + + /// rows in (k2..n) + auto L2 = Ld.block(p + ndrow1, 0, ndrow3, ndcol); + + // Build C (ndrow2 by ndrow1) (col-major) + Eigen::Map C(Cbuf.data(), ndrow2, ndrow1); + if (ndrow1 > 0) { + // C1 = L1 * L1' + C.topRows(ndrow1).noalias() = L1 * L1.adjoint(); + if (ndrow3 > 0) { + // C2 = L2 * L1' + C.bottomRows(ndrow3).noalias() = L2 * L1.adjoint(); + } + } + + /// where each row of C lands within current parent s + VectorI RelativeMap(maxesize); + + for (StorageIndex i = 0; i < ndrow2; ++i) { + const StorageIndex gi = Li[pdi1 + i]; // global row index + RelativeMap[i] = Map[gi]; // local row in s + eigen_assert(RelativeMap[i] >= 0 && RelativeMap[i] < nsrow); + } + + // Assemble: L_s( RelativeMap[i], RelativeMap[j] ) -= C(i,j) + BlockMap Sblock(Lx.data() + psx, nsrow, nscol, StrideType(nsrow, 1)); + for (StorageIndex j = 0; j < ndrow1; ++j) { + const StorageIndex cj = RelativeMap[j]; // parent column index (within s) + eigen_assert(cj >= 0 && cj < nscol); + for (StorageIndex i = j; i < ndrow2; ++i) { + const StorageIndex ri = RelativeMap[i]; // parent row index (within s) + eigen_assert(ri >= j && ri < nsrow); + Sblock(ri, cj) -= C(i, j); + } + } + + // Prepare child d for its next ancestor + Lpos[d] = pdi2 - pdi; // skip rows already used + if (Lpos[d] < ndrow) { + const StorageIndex dancestor = supe.sn_id[Li[pdi2]]; + // enqueue at head of dancestor + next[d] = head[dancestor]; + head[dancestor] = d; + } + } // while children + + // Calculate A_{diag} = L_{diag} L_{diag}^T + // Factorize diagonal block S1 (nscol by nscol) and overwrite with its Cholesky L1 + // Map to a square view with outer stride = nsrow + BlockMap Sfull(Lx.data() + psx, nsrow, nscol, StrideType(nsrow, 1)); + BlockMap A_diag(Lx.data() + psx, nscol, nscol, StrideType(nsrow, 1)); // top nscol rows + + // Compute A_diag = L_diag * L_diag^T + // We know what A_diag is so we can find L_diag using the Cholesky + Eigen::LLT llt; + DenseMat S1copy = A_diag; + llt.compute(S1copy); + if (llt.info() != Eigen::Success) { + // err: not SPD + return false; + } + A_diag.setZero(); + A_diag.template triangularView() = llt.matrixL(); + + // Compute L_rect = A_rect * L_diag^{-T} + // Since we know what A_rect is we can find L_rect + // From A_rect = L_rect * L_diag^T + // and a triangular solve + const StorageIndex nsrow2 = nsrow - nscol; + if (nsrow2 > 0) { + auto S2 = Sfull.bottomRows(nsrow2); // (nsrow2 bu nscol) + + DenseMat S2T = S2.transpose(); + auto L1view = A_diag.template triangularView(); + L1view.solveInPlace(S2T); + S2 = S2T.transpose(); + } + + // Compute A_{off}^{updated} = A_{off} - L_{rect} L_{diag}^T + // This is the update for the next supernode + // So we queue this supernode as a child of its parent + if (nsrow2 > 0) { + Lpos[s] = nscol; // next time, child rows start below the diag block + const StorageIndex parent_s = supe.sn_id[Li[psi + nscol]]; + if (parent_s != kEmpty) { + next[s] = head[parent_s]; + head[parent_s] = s; + } + } + + // clear map entries we set for s + for (StorageIndex k = 0; k < nsrow; ++k) Map[Li[psi + k]] = kEmpty; + } + + return true; // yay it worked + } + + /** + * @brief Export the supernodal factor `L` into a sparse matrix in column-major CSC format. + * + * @param supe supernode structure + * @param Lpi per-supernode pointers into Li (size n_supernodes+1) + * @param Lpx per-supernode pointers into Lx (size n_supernodes+1) + * @param Li row indices per supernode (size = Lpi[n_supernodes]) + * @param Lx dense storage for all supernodes (size = Lpx[n_supernodes]) + * @param n order of the matrix + * @param drop_tol drop tolerance; remove values with abs() <= drop_tol + * @return CholMatrixType Eigen matrix + */ + static CholMatrixType export_sparse(const Supernodes& supe, const VectorI& Lpi, const VectorI& Lpx, const VectorI& Li, + const Matrix& Lx, StorageIndex n, + Scalar drop_tol = Scalar(0)) { + const StorageIndex n_supernodes = static_cast(supe.supernodes.size()) - 1; + + // count nnz(L) quickly: for a supernode with nscol by nsrow (lower form), + // column c (0..nscol-1) has (nsrow - c) entries. + size_t nnz = 0; + for (StorageIndex s = 0; s < n_supernodes; ++s) { + const StorageIndex nscol = supe.supernodes[s + 1] - supe.supernodes[s]; + const StorageIndex nsrow = supe.Snz[s]; + for (StorageIndex c = 0; c < nscol; ++c) nnz += size_t(nsrow - c); + } + + CholMatrixType L(n, n); + L.reserve(nnz); + + // emit columns in order with startVec/insertBack + for (StorageIndex s = 0; s < n_supernodes; ++s) { + const StorageIndex k1 = supe.supernodes[s]; + const StorageIndex k2 = supe.supernodes[s + 1]; + const StorageIndex nscol = k2 - k1; + const StorageIndex nsrow = supe.Snz[s]; + + /// start of row index list for s + const StorageIndex psi = Lpi[s]; + + /// start of dense block for s + const StorageIndex psx = Lpx[s]; + + // dense block is column-major with leading dimension nsrow + for (StorageIndex c = 0; c < nscol; ++c) { + const StorageIndex j = k1 + c; // global column in L + L.startVec(j); + + // lower part for this column within the supernode: + // local rows r = c..nsrow-1 map to global rows Li[psi + r] + const StorageIndex col_offset = psx + c * nsrow; + for (StorageIndex r = c; r < nsrow; ++r) { + const StorageIndex i = Li[psi + r]; // global row + const Scalar v = Lx[col_offset + r]; // value L(i,j) + + // drop exact & near zeros values + if (drop_tol == Scalar(0) ? (v != Scalar(0)) : (std::abs(v) > drop_tol)) { + // i >= j is guaranteed by construction + L.insertBack(i, j) = v; + } + } + } + } + + L.finalize(); + return L; + } +}; +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_SUPERNODAL_CHOLESKY_IMPL_H \ No newline at end of file