su2code · digvijay-y · Jun 10, 2025 · Jun 10, 2025 · Jun 25, 2025 · Jun 25, 2025
diff --git a/Common/include/CConfig.hpp b/Common/include/CConfig.hpp
@@ -537,6 +537,8 @@ class CConfig {
   Kind_TimeStep_Heat,           /*!< \brief Time stepping method for the (fvm) heat equation. */
   n_Datadriven_files;
 
+  ENUM_GRAPH_PART_ALGORITHM Kind_Graph_Part_Algo; /*!< \brief Algorithm for parallel partitioning of the matrix graph. */
+
   DataDrivenFluid_ParsedOptions datadriven_ParsedOptions; /*!< \brief Options for data-driven fluid analysis. */
 
   STRUCT_TIME_INT Kind_TimeIntScheme_FEA;    /*!< \brief Time integration for the FEA equations. */
@@ -634,6 +636,8 @@ class CConfig {
   unsigned long Linear_Solver_Restart_Frequency; /*!< \brief Restart frequency of the linear solver for the implicit formulation. */
   unsigned long Linear_Solver_Prec_Threads;      /*!< \brief Number of threads per rank for ILU and LU_SGS preconditioners. */
   unsigned short Linear_Solver_ILU_n;            /*!< \brief ILU fill=in level. */
+  unsigned short Cuda_Block_Size;                /*!< \brief  User-specified value for the X-Axis dimension of thread blocks
+                                                              that are deployed by the CUDA Kernels. */
   su2double SemiSpan;                   /*!< \brief Wing Semi span. */
   su2double Roe_Kappa;                  /*!< \brief Relaxation of the Roe scheme. */
   su2double Relaxation_Factor_Adjoint;  /*!< \brief Relaxation coefficient for variable updates of adjoint solvers. */
@@ -4160,6 +4164,12 @@ class CConfig {
    */
   bool GetLeastSquaresRequired(void) const { return LeastSquaresRequired; }
 
+    /*!
+   * \brief Get the type of algorithm used for partitioning the matrix graph.
+   * \return Algorithm that divides the matrix into partitions that are executed parallely.
+   */
+  ENUM_GRAPH_PART_ALGORITHM GetKind_Graph_Part_Algo(void) const { return Kind_Graph_Part_Algo; }
+
   /*!
    * \brief Get the kind of solver for the implicit solver.
    * \return Numerical solver for implicit formulation (solving the linear system).
@@ -4221,6 +4231,12 @@ class CConfig {
    */
   su2double GetLinear_Solver_Smoother_Relaxation(void) const { return Linear_Solver_Smoother_Relaxation; }
 
+  /*!
+   * \brief Get thread block dimensions (X-axis) being used by the CUDA Kernels.
+   * \return Thread block dimensions (X-axis) being used by the CUDA Kernels.
+   */
+  unsigned short GetCuda_Block_Size(void) const { return Cuda_Block_Size; }
+
   /*!
    * \brief Get the relaxation factor for solution updates of adjoint solvers.
    */

diff --git a/Common/include/geometry/CGeometry.hpp b/Common/include/geometry/CGeometry.hpp
@@ -260,6 +260,13 @@ class CGeometry {
   unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points
                                                on all prior ranks in the linear partitioning. */
 
+  unsigned long nColor;       /*!< \brief Number of divisions of the matrix graph during execution of parallel
+                                     partitioning algorithms. */
+  unsigned long maxPartitionSize; /*!< \brief Size of the level with the maximum number of elements. */
+  vector<unsigned long>
+      partitionOffsets; /*!< \brief Vector array containing the indices at which different parallel partitions begin. */
+  vector<unsigned long> chainPtr; /*!< \brief Vector array containing distribution of levels into chains. */
+
   /*--- Data structures for point-to-point MPI communications. ---*/
 
   int maxCountPerPoint{0}; /*!< \brief Maximum number of pieces of data sent per vertex in point-to-point comms. */

diff --git a/Common/include/geometry/CPhysicalGeometry.hpp b/Common/include/geometry/CPhysicalGeometry.hpp
@@ -148,6 +148,14 @@ class CPhysicalGeometry final : public CGeometry {
    */
   void DistributeColoring(const CConfig* config, CGeometry* geometry);
 
+  /*!
+   * \brief Divide the graph produced by the matrix into parallel partitions.
+   * \param[in] config - Definition of the particular problem.
+   * \param[in] pointList - Ordered list of points in the mesh.
+   */
+  template <class ScalarType>
+  void PartitionGraph(const CConfig* config, vector<ScalarType>& pointList);
+
   /*!
    * \brief Distribute the grid points, including ghost points, across all ranks based on a ParMETIS coloring.
    * \param[in] config - Definition of the particular problem.

diff --git a/Common/include/linear_algebra/CGraphPartitioning.hpp b/Common/include/linear_algebra/CGraphPartitioning.hpp
@@ -0,0 +1,179 @@
+/*!
+ * \file CGraphPartitioning.hpp
+ * \brief Headers for the classes realted to the algorithms that are used
+                to divide the matrix acyclic graph into parallel partitions.
+ * \author A. Raj
+ * \version 8.2.0 "Harrier"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2025, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../geometry/CGeometry.hpp"
+
+/*!
+ * \class CGraphPartitioning
+ * \brief Abstract base class for defining graph partitioning algorithms.
+ * \author A. Raj
+ *
+ * In order to use certain parallel algorithms in the solution process -
+ * whether with linear solvers or preconditioners - we require the matrix
+ * to be partitioned into certain parallel divisions. These maybe in the form
+ * of levels, blocks, colors and so on. Since a number of different algorithms
+ * can be used to split the graph, we've introduced a base class containing the
+ * "Partition" member function from which child classes of the specific
+ * algorithm can be derived. Currently, we are only using direct declarations
+ * of the derived classes in the code. However, this method was chosen as it
+ * allows us to pass different child class algorithms to a single implementation
+ * of the function that requires it - similar to the CMatrixVectorProduct class.
+ */
+
+template <class ScalarType>
+
+class CGraphPartitioning {
+ public:
+  virtual ~CGraphPartitioning() = 0;
+  virtual void Partition(vector<ScalarType>& pointList, vector<ScalarType>& partitionOffsets,
+                         vector<ScalarType>& chainPtr, unsigned short chainLimit) = 0;
+};
+template <class ScalarType>
+CGraphPartitioning<ScalarType>::~CGraphPartitioning() {}
+
+template <class ScalarType>
+class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
+ private:
+  ScalarType nPointDomain;
+  CPoint* nodes;
+
+ public:
+  ScalarType nLevels;
+  ScalarType maxLevelWidth;
+  vector<ScalarType> levels;
+
+  /*!
+   * \brief constructor of the class
+   * \param[in] nPointDomain_ref - number of points associated with the problem
+   * \param[in] nodes_ref - represents the relationships between the points
+   */
+  inline CLevelScheduling<ScalarType>(ScalarType nPointDomain_ref, CPoint* nodes_ref)
+      : nPointDomain(nPointDomain_ref), nodes(nodes_ref) {
+    nLevels = 0ul;
+    maxLevelWidth = 0ul;
+  }
+
+  /*!
+   * \brief Divides the levels into groups of chains depending on the preset GPU block and warp size.
+   * \param[in] levelOffsets - Represents the vector array containing the ordered list of starting rows of each level.
+   * \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
+   * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block.
+   */
+  void CalculateChain(const vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr, unsigned short rowsPerBlock) {
+    ScalarType levelWidth = 0;
+
+    /*This is not a magic number. We are simply initializing
+    the point array with its first element that is always zero.*/
+    chainPtr.push_back(0);
+
+    for (ScalarType iLevel = 0ul; iLevel < nLevels; iLevel++) {
+      levelWidth = levelOffsets[iLevel + 1] - levelOffsets[iLevel];
+      maxLevelWidth = std::max(levelWidth, maxLevelWidth);
+
+      if (levelWidth > rowsPerBlock) {
+        if (chainPtr.back() != iLevel) {
+          chainPtr.push_back(iLevel);
+        }
+
+        chainPtr.push_back(iLevel + 1);
+      }
+    }
+
+    chainPtr.push_back(nLevels);
+  }
+
+  /*!
+   * \brief Reorders the points according to the levels
+   * \param[in] pointList - Ordered array that contains the list of all mesh points.
+   * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
+   * \param[out] reorderedPointList - Reordered list of points after applying level scheduling.
+   */
+  void Reorder(const vector<ScalarType>& pointList, const vector<ScalarType>& levelOffsets,
+               vector<ScalarType>& reorderedPointList) {
+    auto levelOffsetsCursor = levelOffsets;
+
+    for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
+      const auto globalPoint = pointList[localPoint];
+      reorderedPointList[levelOffsetsCursor[levels[localPoint]]++] = globalPoint;
+    }
+  }
+
+  /*!
+   * \brief Reorders the points according to the levels
+   * \param[in,out] pointList - Ordered array that contains the list of all mesh points.
+   * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
+   * \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
+   * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block.
+   */
+  void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr,
+                 unsigned short rowsPerBlock) override {
+    vector<ScalarType> inversePointList(nPointDomain);
+    levels.resize(nPointDomain, 0ul);
+
+    for (auto point = 0ul; point < nPointDomain; point++) {
+      inversePointList[pointList[point]] = point;
+    }
+
+    //  Local Point - Ordering of the points post the RCM ordering
+    //  Global Point - Original order of the points before the RCM ordering
+
+    for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
+      const auto globalPoint = pointList[localPoint];
+
+      for (auto adjPoints = 0u; adjPoints < nodes->GetnPoint(globalPoint); adjPoints++) {
+        const auto adjGlobalPoint = nodes->GetPoint(globalPoint, adjPoints);
+
+        if (adjGlobalPoint < nPointDomain) {
+          const auto adjLocalPoint = inversePointList[adjGlobalPoint];
+
+          if (adjLocalPoint < localPoint) {
+            levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1);
+          }
+        }
+      }
+
+      nLevels = std::max(nLevels, levels[localPoint] + 1);
+    }
+
+    levelOffsets.resize(nLevels + 1);
+    for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) {
+      ++levelOffsets[levels[iPoint] + 1];
+    }
+
+    for (auto iLevel = 2ul; iLevel <= nLevels; ++iLevel) {
+      levelOffsets[iLevel] += levelOffsets[iLevel - 1];
+    }
+
+    Reorder(pointList, levelOffsets, inversePointList);
+    pointList = std::move(inversePointList);
+
+    CalculateChain(levelOffsets, chainPtr, rowsPerBlock);
+  }
+};
diff --git a/Common/include/linear_algebra/CLinearAlgebraUtils.hpp b/Common/include/linear_algebra/CLinearAlgebraUtils.hpp
@@ -0,0 +1,40 @@
+/*! 
+ * \file CLinearAlgebraUtils.hpp
+ * \brief Utility helpers for linear algebra modules.
+ * \author SU2 Contributors
+ * \version 8.2.0 "Harrier"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2025, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../parallelization/omp_structure.hpp"
+#include "../option_structure.hpp"
+
+namespace LinearAlgebraUtils {
+
+inline unsigned short ComputeRowsPerCudaBlock(unsigned short cudaBlockSize) {
+  return static_cast<unsigned short>(
+      roundUpDiv(static_cast<size_t>(cudaKernelParameters::CUDA_WARP_SIZE), static_cast<size_t>(cudaBlockSize)));
+}
+
+}  // namespace LinearAlgebraUtils
diff --git a/Common/include/linear_algebra/CPreconditioner.hpp b/Common/include/linear_algebra/CPreconditioner.hpp
@@ -205,6 +205,12 @@ class CLU_SGSPreconditioner final : public CPreconditioner<ScalarType> {
    * \param[out] v - CSysVector that is the result of the preconditioning.
    */
   inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
+#ifdef HAVE_CUDA
+    if (config->GetCUDA()) {
+      sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config);
+      return;
+    }
+#endif
     sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
   }
 };

diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp
@@ -148,8 +148,10 @@ class CSysMatrix {
   ScalarType* d_matrix;           /*!< \brief Device Pointer to store the matrix values on the GPU. */
   const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
   const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
-  bool useCuda;                   /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
-                                     Mainly used to conditionally free GPU memory in the class destructor. */
+  const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */
+  unsigned long* d_partition_offsets;
+  bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
+                   Mainly used to conditionally free GPU memory in the class destructor. */
 
   ScalarType* ILU_matrix;           /*!< \brief Entries of the ILU sparse matrix. */
   unsigned long nnz_ilu;            /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
@@ -904,8 +906,8 @@ class CSysMatrix {
    * \param[in] vec - CSysVector to be multiplied by the preconditioner.
    * \param[out] prod - Result of the product A*vec.
    */
-  void GPUComputeLU_SGSPreconditioner(ScalarType& vec, ScalarType& prod, CGeometry* geometry,
-                                      const CConfig* config) const;
+  void GPUComputeLU_SGSPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
+                                      CGeometry* geometry, const CConfig* config) const;
 
   /*!
    * \brief Build the Jacobi preconditioner.