-
Notifications
You must be signed in to change notification settings - Fork 16
Add tuned HIP GiMMiK preload-C and width variants with non-temporal loads and stores #19
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
tomjen12
wants to merge
15
commits into
PyFR:master
Choose a base branch
from
tomjen12:hip-gimmik-optimized
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 2 commits
Commits
Show all changes
15 commits
Select commit
Hold shift + click to select a range
8e23d63
Add tuned HIP GiMMiK preload variants
tomjen12 8f4d03e
Fix HIP GiMMiK block size metadata
tomjen12 96671a6
Address HIP GiMMiK review comments
tomjen12 739a82e
Handle ROCm feature suffixes for gfx942 tuning
tomjen12 0633539
Enable tuned HIP variants on gfx90a
tomjen12 7b59fb0
Parameterize HIP vector width and refine preload kernels
tomjen12 e9b921a
Use blockx launch bounds for HIP cstream preload
tomjen12 2aa2577
Always use non-temporal C accesses for HIP
tomjen12 be1c1db
feat(hip): add non-temporal B-load (NTB) variants for bstream-msplit
EricKing626 280e948
Use non-temporal B loads by default for HIP
tomjen12 c06216d
Make HIP preload-C a template option
tomjen12 e014e4d
Avoid HIP vector operator+= overloads
tomjen12 f6bc308
Prune HIP tuned variants to 12
tomjen12 a3aee45
Remove HIP variant arch gate
tomjen12 2c7af9b
Restore MI355 HIP baseline variants
tomjen12 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,98 @@ | ||
| <%inherit file='base'/> | ||
|
|
||
| <% | ||
| mx = partition(A, into=msplit, by='rows') | ||
| bchunks = chunk(bix, bsz) | ||
| %> | ||
|
|
||
| ## Kernel template: streams B through a two-slot shared-memory buffer while | ||
| ## the rows of A are divided between msplit values of threadIdx.y (mx). | ||
| ## In this variant each lane first fills its csub accumulators with the | ||
| ## beta-scaled C values, so the final store needs no further read of C. | ||
| ## make_zero, nt_load_c and nt_store_c are not defined in this template; | ||
| ## presumably they come from the inherited 'base' template -- TODO confirm. | ||
| ## bchunks presumably holds the bix indices in groups of at most bsz. | ||
| __global__ __launch_bounds__(${blockx*msplit}) void | ||
| % if n is None: | ||
| ${kname}(int n, | ||
| const ${dtype}* __restrict__ b, int ldb, | ||
| ${dtype}* __restrict__ c, int ldc) | ||
| { | ||
| ## NOTE(review): the fixed-n branch below sets n to ceil(n / width), whereas | ||
| ## this branch rounds n up to a multiple of width without dividing, even | ||
| ## though ldb and ldc are divided. Confirm this is intended, since i is | ||
| ## compared against n in the guards further down. | ||
| % if width > 1: | ||
| n = ((n + ${width} - 1) / ${width}) * ${width}; | ||
| ldb /= ${width}; | ||
| ldc /= ${width}; | ||
| % endif | ||
| % else: | ||
| ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) | ||
| { | ||
| ## n, ldb and ldc are baked in at render time, pre-divided by width; the | ||
| ## leading dimensions widen to long long when k*ldb or m*ldc would reach | ||
| ## width*2**31. | ||
| const int n = ${-(-n // width)}; | ||
| const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; | ||
| const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; | ||
| % endif | ||
| int i = blockDim.x*blockIdx.x + threadIdx.x; | ||
|
|
||
| ## bv holds the current B value; csub has ceil(m / msplit) accumulators per | ||
| ## thread; bsub is the two-slot shared buffer indexed [slot][row][threadIdx.x]. | ||
| ${dtype} bv, csub[${-(-m // msplit)}]; | ||
| __shared__ ${dtype} bsub[2][${bsz}][${blockx}]; | ||
|
|
||
| ## Fill the initial shared memory block | ||
| ## (the rows of the first chunk are shared out between lanes by | ||
| ## loop.index % msplit, so each lane loads only its own subset) | ||
| % for cid in range(msplit): | ||
| if (i < n && threadIdx.y == ${cid}) | ||
| { | ||
| % for kx in bchunks[0]: | ||
| % if loop.index % msplit == cid: | ||
| bsub[0][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb]; | ||
| % endif | ||
| % endfor | ||
|
|
||
| ## Preload C values for active rows owned by this m-split lane | ||
| ## (rows with afix == -1 are all-zero rows of A and are dealt with after the | ||
| ## last chunk; beta == 0 starts from zero without reading C, beta == 1 loads | ||
| ## C unscaled, and any other beta scales the loaded value) | ||
| % for j, jx in enumerate(mx[cid]): | ||
| % if afix[jx] != -1: | ||
| % if beta == 0: | ||
| csub[${j}] = make_zero(); | ||
| % elif beta == 1: | ||
| csub[${j}] = nt_load_c(&c[i + ${jx}*ldc]); | ||
| % else: | ||
| csub[${j}] = ${beta}*nt_load_c(&c[i + ${jx}*ldc]); | ||
| % endif | ||
| % endif | ||
| % endfor | ||
| } | ||
| % endfor | ||
| ## Make the first shared block visible to every lane before it is read | ||
| __syncthreads(); | ||
|
|
||
| ## Iterate over each row-chunk of B | ||
| % for bb in range(len(bchunks)): | ||
| ## Iterate over each row-chunk of C | ||
| % for cid, mcx in enumerate(mx): | ||
| if (i < n && threadIdx.y == ${cid}) | ||
| { | ||
| ## Start filling the next shared memory block | ||
| ## (written to the other slot, (bb + 1) % 2, while slot bb % 2 is being read; | ||
| ## loop.parent here is the enclosing bb loop) | ||
| % if not loop.parent.last: | ||
| % for kx in bchunks[bb + 1]: | ||
| % if loop.index % msplit == cid: | ||
| bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb]; | ||
| % endif | ||
| % endfor | ||
| % endif | ||
| ## Accumulate our dot products | ||
| ## (zero entries of A emit no code; csub already contains the beta term, so | ||
| ## every contribution is a plain +=) | ||
| % for kx in bchunks[bb]: | ||
| bv = bsub[${bb % 2}][${loop.index}][threadIdx.x]; | ||
| % for j, jx in enumerate(A[mcx, kx]): | ||
| % if jx != 0: | ||
| csub[${j}] += ${jx}*bv; | ||
| % endif | ||
| ## If we're done with this dot product then store to global | ||
| ## (alix presumably gives the last contributing B index of each row -- TODO | ||
| ## confirm; the store is unconditional because beta was applied on preload) | ||
| % if kx == alix[mcx[j]]: | ||
| nt_store_c(&c[i + ${mcx[j]}*ldc], csub[${j}]); | ||
| % endif | ||
| % endfor | ||
| % endfor | ||
| ## Handle rows of A which are all zero | ||
| ## (only on the last chunk; beta == 0 writes zeros, beta == 1 emits nothing | ||
| ## and so leaves C untouched, and any other beta rescales C in place) | ||
| % if loop.parent.last: | ||
| % for j, jx in enumerate(afix): | ||
| % if jx == -1 and j % msplit == cid and beta == 0: | ||
| nt_store_c(&c[i + ${j}*ldc], make_zero()); | ||
| % elif jx == -1 and j % msplit == cid and beta != 1: | ||
| nt_store_c(&c[i + ${j}*ldc], ${beta}*nt_load_c(&c[i + ${j}*ldc])); | ||
| % endif | ||
| % endfor | ||
| % endif | ||
| } | ||
| % endfor | ||
| ## Barrier after each chunk, before the slot just read is refilled | ||
| __syncthreads(); | ||
| % endfor | ||
| } | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we merge this into the msplit kernel, with preload as an option, using % if/else as appropriate to switch between the two?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done. I merged the preload-C path into the existing msplit template behind a
`preload` option, and removed the separate preload-C template.