Skip to content
Open
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
79c9e52
updating branch
areenraj Jun 10, 2025
6686171
updating branch
areenraj Jun 10, 2025
b7591f4
Major commit: graph partitioning algorithms, level scheduling method,…
areenraj Jun 25, 2025
6f51f45
Major commit: graph partitioning algorithms, level scheduling method,…
areenraj Jun 25, 2025
cdf0225
resolving conflicts
areenraj Jun 25, 2025
9dd3028
resolving conflicts
areenraj Jun 25, 2025
2495edd
resolving some more conflicts
areenraj Jun 25, 2025
f68358e
cleaning up
areenraj Jun 25, 2025
bf561b6
apologies for repeated commits, just cleaning
areenraj Jun 25, 2025
9121e25
coalesced memory access for MVP, shared memory addition and lamda fun…
areenraj Jun 29, 2025
991e29f
bug fixes
areenraj Jul 1, 2025
9bfeff9
Merge remote-tracking branch 'upstream/master'
areenraj Jul 3, 2025
0372099
Working GPU LU_SGS Preconditioner Port
areenraj Jul 15, 2025
52b90b6
Fixed the issue with the visibility of the rowsPerBlock variable. Als…
areenraj Jul 17, 2025
d367627
Working LU_SGS Preconditioner with graph partitioned algorithms, upda…
areenraj Jul 17, 2025
661c9b8
LU_SGS Preconditioner Port
areenraj Jul 17, 2025
b5cf7dd
Merge branch 'master' of https://github.com/areenraj/SU2_GSoC_GPU
areenraj Jul 17, 2025
c4dbe5c
Fixing warnings
areenraj Jul 17, 2025
b3d2fbf
Merge branch 'develop' of https://github.com/su2code/SU2
areenraj Jul 17, 2025
1be1e2f
Syncing repo to develop
areenraj Jul 17, 2025
7472bd1
updating submodule versions
areenraj Jul 17, 2025
352e148
Fixing some more warnings
areenraj Jul 17, 2025
c3b8062
Addresed changes in PR 2539
digvijay-y Mar 8, 2026
64df92e
Merge branch 'develop' into gpu-lusgs
digvijay-y Mar 8, 2026
340d2f9
WIP: local changes before merge
digvijay-y Mar 8, 2026
791a03e
Merge branch 'gpu-lusgs' of https://github.com/digvijay-y/SU2 into gp…
digvijay-y Mar 8, 2026
4ac7e97
ncolor -> nGraphPartition
digvijay-y Mar 8, 2026
b808fdc
Merge branch 'develop' into gpu-lusgs
digvijay-y Mar 9, 2026
b0bde3f
Merge branch 'develop' into gpu-lusgs
digvijay-y Mar 11, 2026
8cfb381
Merge branch 'develop' into gpu-lusgs
digvijay-y Apr 7, 2026
6001d61
Merge branch 'develop' into gpu-lusgs
digvijay-y Apr 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions Common/include/CConfig.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,8 @@ class CConfig {
Kind_TimeStep_Heat, /*!< \brief Time stepping method for the (fvm) heat equation. */
n_Datadriven_files;

ENUM_GRAPH_PART_ALGORITHM Kind_Graph_Part_Algo; /*!< \brief Algorithm for parallel partitioning of the matrix graph. */

DataDrivenFluid_ParsedOptions datadriven_ParsedOptions; /*!< \brief Options for data-driven fluid analysis. */

STRUCT_TIME_INT Kind_TimeIntScheme_FEA; /*!< \brief Time integration for the FEA equations. */
Expand Down Expand Up @@ -634,6 +636,8 @@ class CConfig {
unsigned long Linear_Solver_Restart_Frequency; /*!< \brief Restart frequency of the linear solver for the implicit formulation. */
unsigned long Linear_Solver_Prec_Threads; /*!< \brief Number of threads per rank for ILU and LU_SGS preconditioners. */
unsigned short Linear_Solver_ILU_n; /*!< \brief ILU fill=in level. */
unsigned short Cuda_Block_Size; /*!< \brief User-specified value for the X-Axis dimension of thread blocks
that are deployed by the CUDA Kernels. */
su2double SemiSpan; /*!< \brief Wing Semi span. */
su2double Roe_Kappa; /*!< \brief Relaxation of the Roe scheme. */
su2double Relaxation_Factor_Adjoint; /*!< \brief Relaxation coefficient for variable updates of adjoint solvers. */
Expand Down Expand Up @@ -4160,6 +4164,12 @@ class CConfig {
*/
bool GetLeastSquaresRequired(void) const { return LeastSquaresRequired; }

/*!
* \brief Get the type of algorithm used for partitioning the matrix graph.
* \return Algorithm that divides the matrix into partitions that are executed parallely.
*/
ENUM_GRAPH_PART_ALGORITHM GetKind_Graph_Part_Algo(void) const { return Kind_Graph_Part_Algo; }

/*!
* \brief Get the kind of solver for the implicit solver.
* \return Numerical solver for implicit formulation (solving the linear system).
Expand Down Expand Up @@ -4221,6 +4231,12 @@ class CConfig {
*/
su2double GetLinear_Solver_Smoother_Relaxation(void) const { return Linear_Solver_Smoother_Relaxation; }

/*!
* \brief Get thread block dimensions (X-axis) being used by the CUDA Kernels.
* \return Thread block dimensions (X-axis) being used by the CUDA Kernels.
*/
unsigned short GetCuda_Block_Size(void) const { return Cuda_Block_Size; }

/*!
* \brief Get the relaxation factor for solution updates of adjoint solvers.
*/
Expand Down
7 changes: 7 additions & 0 deletions Common/include/geometry/CGeometry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,13 @@ class CGeometry {
unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points
on all prior ranks in the linear partitioning. */

unsigned long nColor; /*!< \brief Number of divisions of the matrix graph during execution of parallel
Comment thread
digvijay-y marked this conversation as resolved.
Outdated
partitioning algorithms. */
unsigned long maxPartitionSize; /*!< \brief Size of the level with the maximum number of elements. */
vector<unsigned long>
partitionOffsets; /*!< \brief Vector array containing the indices at which different parallel partitions begin. */
vector<unsigned long> chainPtr; /*!< \brief Vector array containing distribution of levels into chains. */

/*--- Data structures for point-to-point MPI communications. ---*/

int maxCountPerPoint{0}; /*!< \brief Maximum number of pieces of data sent per vertex in point-to-point comms. */
Expand Down
8 changes: 8 additions & 0 deletions Common/include/geometry/CPhysicalGeometry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,14 @@ class CPhysicalGeometry final : public CGeometry {
*/
void DistributeColoring(const CConfig* config, CGeometry* geometry);

/*!
* \brief Divide the graph produced by the matrix into parallel partitions.
* \param[in] config - Definition of the particular problem.
* \param[in] pointList - Ordered list of points in the mesh.
*/
template <class ScalarType>
void PartitionGraph(const CConfig* config, vector<ScalarType>& pointList);

/*!
* \brief Distribute the grid points, including ghost points, across all ranks based on a ParMETIS coloring.
* \param[in] config - Definition of the particular problem.
Expand Down
179 changes: 179 additions & 0 deletions Common/include/linear_algebra/CGraphPartitioning.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
/*!
* \file CGraphPartitioning.hpp
* \brief Headers for the classes realted to the algorithms that are used
to divide the matrix acyclic graph into parallel partitions.
* \author A. Raj
* \version 8.2.0 "Harrier"
*
* SU2 Project Website: https://su2code.github.io
*
* The SU2 Project is maintained by the SU2 Foundation
* (http://su2foundation.org)
*
* Copyright 2012-2025, SU2 Contributors (cf. AUTHORS.md)
*
* SU2 is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* SU2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include "../geometry/CGeometry.hpp"

/*!
* \class CGraphPartitioning
* \brief Abstract base class for defining graph partitioning algorithms.
* \author A. Raj
*
* In order to use certain parallel algorithms in the solution process -
* whether with linear solvers or preconditioners - we require the matrix
* to be partitioned into certain parallel divisions. These maybe in the form
* of levels, blocks, colors and so on. Since a number of different algorithms
* can be used to split the graph, we've introduced a base class containing the
* "Partition" member function from which child classes of the specific
* algorithm can be derived. Currently, we are only using direct declarations
* of the derived classes in the code. However, this method was chosen as it
* allows us to pass different child class algorithms to a single implementation
* of the function that requires it - similar to the CMatrixVectorProduct class.
*/

template <class ScalarType>

class CGraphPartitioning {
public:
virtual ~CGraphPartitioning() = 0;
virtual void Partition(vector<ScalarType>& pointList, vector<ScalarType>& partitionOffsets,
vector<ScalarType>& chainPtr, unsigned short chainLimit) = 0;
};
template <class ScalarType>
CGraphPartitioning<ScalarType>::~CGraphPartitioning() {}

template <class ScalarType>
class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
private:
ScalarType nPointDomain;
CPoint* nodes;

public:
ScalarType nLevels;
ScalarType maxLevelWidth;
vector<ScalarType> levels;

/*!
* \brief constructor of the class
* \param[in] nPointDomain_ref - number of points associated with the problem
* \param[in] nodes_ref - represents the relationships between the points
*/
inline CLevelScheduling<ScalarType>(ScalarType nPointDomain_ref, CPoint* nodes_ref)
: nPointDomain(nPointDomain_ref), nodes(nodes_ref) {
nLevels = 0ul;
maxLevelWidth = 0ul;
}

/*!
* \brief Divides the levels into groups of chains depending on the preset GPU block and warp size.
* \param[in] levelOffsets - Represents the vector array containing the ordered list of starting rows of each level.
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
* \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block.
*/
void CalculateChain(const vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr, unsigned short rowsPerBlock) {
ScalarType levelWidth = 0;

/*This is not a magic number. We are simply initializing
the point array with its first element that is always zero.*/
chainPtr.push_back(0);

for (ScalarType iLevel = 0ul; iLevel < nLevels; iLevel++) {
levelWidth = levelOffsets[iLevel + 1] - levelOffsets[iLevel];
maxLevelWidth = std::max(levelWidth, maxLevelWidth);

if (levelWidth > rowsPerBlock) {
if (chainPtr.back() != iLevel) {
chainPtr.push_back(iLevel);
}

chainPtr.push_back(iLevel + 1);
}
}

chainPtr.push_back(nLevels);
}

/*!
* \brief Reorders the points according to the levels
* \param[in] pointList - Ordered array that contains the list of all mesh points.
* \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
* \param[out] reorderedPointList - Reordered list of points after applying level scheduling.
*/
void Reorder(const vector<ScalarType>& pointList, const vector<ScalarType>& levelOffsets,
vector<ScalarType>& reorderedPointList) {
auto levelOffsetsCursor = levelOffsets;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reason for using a buffer variable? (Interesting variable name btw)

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because of the moving variable, reorder increments per-level write positions, but original level offsets are kept unchanged for downstream

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will have to re-read my code because I dont remember what I wrote a year ago. But won't this affect the calculations down the line? Did you find any difference in the values while comparing it to the CPU implementation?


for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
const auto globalPoint = pointList[localPoint];
reorderedPointList[levelOffsetsCursor[levels[localPoint]]++] = globalPoint;
}
}

/*!
* \brief Reorders the points according to the levels
* \param[in,out] pointList - Ordered array that contains the list of all mesh points.
* \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
* \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block.
*/
void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr,
unsigned short rowsPerBlock) override {
vector<ScalarType> inversePointList(nPointDomain);
levels.resize(nPointDomain, 0ul);

for (auto point = 0ul; point < nPointDomain; point++) {
inversePointList[pointList[point]] = point;
}

// Local Point - Ordering of the points post the RCM ordering
// Global Point - Original order of the points before the RCM ordering

for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
const auto globalPoint = pointList[localPoint];

for (auto adjPoints = 0u; adjPoints < nodes->GetnPoint(globalPoint); adjPoints++) {
const auto adjGlobalPoint = nodes->GetPoint(globalPoint, adjPoints);

if (adjGlobalPoint < nPointDomain) {
const auto adjLocalPoint = inversePointList[adjGlobalPoint];

if (adjLocalPoint < localPoint) {
levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1);
}
}
}

nLevels = std::max(nLevels, levels[localPoint] + 1);
}

levelOffsets.resize(nLevels + 1);
for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) {
++levelOffsets[levels[iPoint] + 1];
}

for (auto iLevel = 2ul; iLevel <= nLevels; ++iLevel) {
levelOffsets[iLevel] += levelOffsets[iLevel - 1];
}

Reorder(pointList, levelOffsets, inversePointList);
pointList = std::move(inversePointList);

CalculateChain(levelOffsets, chainPtr, rowsPerBlock);
}
};
40 changes: 40 additions & 0 deletions Common/include/linear_algebra/CLinearAlgebraUtils.hpp
Comment thread
digvijay-y marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*!
* \file CLinearAlgebraUtils.hpp
* \brief Utility helpers for linear algebra modules.
* \author SU2 Contributors
* \version 8.2.0 "Harrier"
*
* SU2 Project Website: https://su2code.github.io
*
* The SU2 Project is maintained by the SU2 Foundation
* (http://su2foundation.org)
*
* Copyright 2012-2025, SU2 Contributors (cf. AUTHORS.md)
*
* SU2 is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* SU2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include "../parallelization/omp_structure.hpp"
#include "../option_structure.hpp"

namespace LinearAlgebraUtils {

inline unsigned short ComputeRowsPerCudaBlock(unsigned short cudaBlockSize) {
return static_cast<unsigned short>(
roundUpDiv(static_cast<size_t>(cudaKernelParameters::CUDA_WARP_SIZE), static_cast<size_t>(cudaBlockSize)));
}

} // namespace LinearAlgebraUtils
6 changes: 6 additions & 0 deletions Common/include/linear_algebra/CPreconditioner.hpp
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks good. Did you test this once with the flag on or off, just to make sure?

Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,12 @@ class CLU_SGSPreconditioner final : public CPreconditioner<ScalarType> {
* \param[out] v - CSysVector that is the result of the preconditioning.
*/
inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
#ifdef HAVE_CUDA
if (config->GetCUDA()) {
sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config);
return;
}
#endif
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
}
};
Expand Down
10 changes: 6 additions & 4 deletions Common/include/linear_algebra/CSysMatrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,10 @@ class CSysMatrix {
ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */
const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
Mainly used to conditionally free GPU memory in the class destructor. */
const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */
unsigned long* d_partition_offsets;
bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
Mainly used to conditionally free GPU memory in the class destructor. */

ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */
unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
Expand Down Expand Up @@ -904,8 +906,8 @@ class CSysMatrix {
* \param[in] vec - CSysVector to be multiplied by the preconditioner.
* \param[out] prod - Result of the product A*vec.
*/
void GPUComputeLU_SGSPreconditioner(ScalarType& vec, ScalarType& prod, CGeometry* geometry,
const CConfig* config) const;
void GPUComputeLU_SGSPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
CGeometry* geometry, const CConfig* config) const;

/*!
* \brief Build the Jacobi preconditioner.
Expand Down
Loading