From 2f09b6dc83c913c1610c1c5ebcf76d03074d1d9b Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Mon, 20 Apr 2026 16:54:53 -0500 Subject: [PATCH 01/28] Add third party submodiles --- .gitmodules | 8 ++++++++ third-party/ELFIO | 1 + third-party/msgpack-c | 1 + 3 files changed, 10 insertions(+) create mode 100644 .gitmodules create mode 160000 third-party/ELFIO create mode 160000 third-party/msgpack-c diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..2ddae89 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,8 @@ +[submodule "third-party/msgpack-c"] + path = third-party/msgpack-c + url = https://github.com/msgpack/msgpack-c.git + branch = cpp_master +[submodule "third-party/ELFIO"] + path = third-party/ELFIO + url = https://github.com/serge1/ELFIO.git + branch = main diff --git a/third-party/ELFIO b/third-party/ELFIO new file mode 160000 index 0000000..94f7706 --- /dev/null +++ b/third-party/ELFIO @@ -0,0 +1 @@ +Subproject commit 94f7706b5325b2ad9872e4481278278592cf86c9 diff --git a/third-party/msgpack-c b/third-party/msgpack-c new file mode 160000 index 0000000..092bc69 --- /dev/null +++ b/third-party/msgpack-c @@ -0,0 +1 @@ +Subproject commit 092bc69b6e815980bce7808595c914dd3a29f905 From ca1dc947c7d604437a7c0cb679efd32767e69d0f Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Mon, 20 Apr 2026 16:55:36 -0500 Subject: [PATCH 02/28] Dump source code --- CMakeLists.txt | 73 ++++++++ extract-fatbin.cpp | 44 +++++ extract-gpubin.cpp | 88 +++++++++ instr-driver | 81 ++++++++ preload.cpp | 273 +++++++++++++++++++++++++++ update-exec.cpp | 428 +++++++++++++++++++++++++++++++++++++++++++ update-fatbin.cpp | 255 ++++++++++++++++++++++++++ update-note-phdr.cpp | 91 +++++++++ update-note.cpp | 258 ++++++++++++++++++++++++++ 9 files changed, 1591 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 extract-fatbin.cpp create mode 100644 extract-gpubin.cpp create mode 100755 instr-driver create mode 100644 preload.cpp create mode 100644 update-exec.cpp create mode 100644 update-fatbin.cpp create mode 100644 update-note-phdr.cpp create mode 100644 update-note.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..3ab139c --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,73 @@ +cmake_minimum_required(VERSION 3.20) + +set(ROCM_PATH + "" + CACHE PATH "Path to ROCm install directory") + +if("${ROCM_PATH}" STREQUAL "") + message( + FATAL_ERROR + "\n ROCM_PATH not set" + "Please provide it using: cmake -DROCM_PATH=/path/to/rocm/install \n") +endif() + +set(CMAKE_C_COMPILER "${ROCM_PATH}/bin/amdclang") +set(CMAKE_CXX_COMPILER "${ROCM_PATH}/bin/amdclang++") + +project(amdgpu-tooling LANGUAGES CXX) +# ALL REGULAR TOOLS + +add_executable(update-note update-note.cpp) +target_include_directories( + update-note PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/msgpack/include) + +add_executable(update-note-phdr update-note-phdr.cpp) +target_include_directories( + update-note-phdr PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/elfio-3.11) + +add_executable(extract-fatbin extract-fatbin.cpp) +target_include_directories( + extract-fatbin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/elfio-3.11) + +add_executable(extract-gpubin extract-gpubin.cpp) + +add_executable(update-fatbin update-fatbin.cpp) + +add_executable(update-exec update-exec.cpp) +target_include_directories( + update-exec PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/elfio-3.11) + +# SPECIAL CASE FOR PRELOAD + +# Paths for hipcc and the preload file +set(HIPCC "${ROCM_PATH}/bin/hipcc") +set(PRELOAD_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/preload.cpp") +set(PRELOAD_SO "${CMAKE_CURRENT_BINARY_DIR}/preload.so") + +# Actual command to build preload.so +add_custom_command( + OUTPUT "${PRELOAD_SO}" + COMMAND ${HIPCC} -D__HIP_PLATFORM_AMD__ -x c++ -shared -fpic + -I/opt/rocm-6.0.0/include/ "${PRELOAD_SOURCE}" -o "${PRELOAD_SO}" + DEPENDS "${PRELOAD_SOURCE}" + COMMENT "Building ${PRELOAD_SOURCE} with ${HIPCC}" + VERBATIM) + +add_custom_target(PreloadFile ALL DEPENDS "${PRELOAD_SO}") + +# SPECIAL CASE FOR instr-driver + +# Paths for instr-driver +set(DRIVER_SCRIPT_SRC "${CMAKE_CURRENT_SOURCE_DIR}/instr-driver") +set(DRIVER_SCRIPT_DEST "${CMAKE_CURRENT_BINARY_DIR}/instr-driver") + +# Command to copy instr-driver +add_custom_command( + OUTPUT "${DRIVER_SCRIPT_DEST}" + COMMAND ${CMAKE_COMMAND} -E copy "${DRIVER_SCRIPT_SRC}" + "${DRIVER_SCRIPT_DEST}" + DEPENDS "${DRIVER_SCRIPT_SRC}" + COMMENT "Copying ${DRIVER_SCRIPT_SRC} to build directory" + VERBATIM) + +add_custom_target(DriverScript ALL DEPENDS "${DRIVER_SCRIPT_DEST}") diff --git a/extract-fatbin.cpp b/extract-fatbin.cpp new file mode 100644 index 0000000..654ef1e --- /dev/null +++ b/extract-fatbin.cpp @@ -0,0 +1,44 @@ +#include +#include +#include + +#include "elfio/elfio.hpp" + +static ELFIO::section *getSection(const std::string §ionName, const ELFIO::elfio &file) { + for (int i = 0; i < file.sections.size(); ++i) { + if (file.sections[i]->get_name() == sectionName) + return file.sections[i]; + } + return nullptr; +} + +static ELFIO::section *getFatbinSection(const ELFIO::elfio &file) { + return getSection(".hip_fatbin", file); +} + +int main(int argc, char **argv) { + if (argc != 2) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + ELFIO::elfio execFile; + if (!execFile.load(argv[1])) { + std::cout << "can't find or process ELF file " << argv[1] << '\n'; + exit(1); + } + + ELFIO::section *fatbinSection = getFatbinSection(execFile); + if (!fatbinSection) { + std::cout << ".hip_fatbin section not found in " << argv[1] << "\n"; + exit(1); + } + + // Write fatbin to a separate file + std::ofstream fatbinFile(std::string(argv[1]) + ".fatbin", std::ios::out | std::ios::binary); + + fatbinFile.write(fatbinSection->get_data(), fatbinSection->get_size()); + fatbinFile.close(); + + return 0; +} diff --git a/extract-gpubin.cpp b/extract-gpubin.cpp new file mode 100644 index 0000000..c06dfd2 --- /dev/null +++ b/extract-gpubin.cpp @@ -0,0 +1,88 @@ +#include +#include +#include +#include + +void showHelp(const std::string &toolName) { + std::cerr << "Usage : " << toolName << " " + << "" << std::endl; + std::cerr << "supported architectures : gfx900, gfx906, gfx908, gfx90a, gfx940" << std::endl; +} + +int main(int argc, char *argv[]) { + if (argc != 3) { + showHelp(argv[0]); + exit(1); + } + + std::string arch(argv[1]); + std::string fatbinPath(argv[2]); + + std::ifstream fatbin(fatbinPath, std::ios::binary); + if (!fatbin) { + std::cerr << "error : can't open " << fatbinPath << std::endl; + exit(1); + } + + char buffer[24 + 1]; + fatbin.read(buffer, 24); + buffer[24] = 0; + + assert(std::string(buffer) == "__CLANG_OFFLOAD_BUNDLE__"); + + uint64_t numBundleEntries = 0; + fatbin.read(reinterpret_cast(&numBundleEntries), sizeof(numBundleEntries)); + + uint64_t elfStart = 0; + uint64_t elfSize = 0; + bool found = false; + + // Read metadata for each elf object in this bundle + while (numBundleEntries) { + uint64_t bundleEntryCodeObjectOffset; // offset from begining of the fatbin + fatbin.read(reinterpret_cast(&bundleEntryCodeObjectOffset), + sizeof(bundleEntryCodeObjectOffset)); + + uint64_t size; + fatbin.read(reinterpret_cast(&size), sizeof(size)); + + uint64_t idLength; + fatbin.read(reinterpret_cast(&idLength), sizeof(idLength)); + + char id[idLength]; + fatbin.read(id, idLength); + + std::string idString(id); + // If idString ends with arch + if (idString.substr(idLength - arch.length()) == arch) { + elfStart = bundleEntryCodeObjectOffset; + elfSize = size; + found = true; + } + numBundleEntries--; + } + + if (!found) { + std::cerr << fatbinPath << " doesn't contain a " << arch << " binary\n"; + exit(0); + } + + // std::cout << arch << ' ' << "ELF at " << elfStart << " of size " << elfSize << '\n'; + + fatbin.seekg(elfStart, std::ios::beg); + char data[elfSize]; + fatbin.read(data, elfSize); + + std::string elfBinPath(fatbinPath + "." + arch); + std::ofstream elfBin(elfBinPath, std::ios::binary); + + if (!elfBin) { + std::cerr << "error : can't create " << elfBinPath << std::endl; + exit(1); + } + + elfBin.write(data, elfSize); + elfBin.close(); + + fatbin.close(); +} diff --git a/instr-driver b/instr-driver new file mode 100755 index 0000000..78ee59f --- /dev/null +++ b/instr-driver @@ -0,0 +1,81 @@ +#!/bin/bash + +# TODO : Make this script more like a CLI tool + +MUTATOR=$1 + +EXEC_IN=$2 +EXEC_UPDATED=$EXEC_IN.updated +EXEC_RENAMED1=$EXEC_UPDATED.renamed1 +EXEC_RENAMED2=$EXEC_UPDATED.renamed2 +EXEC_OUT=$EXEC_IN.out + +FATBIN=$EXEC_IN.fatbin +FATBIN_UPDATED=$FATBIN.updated + +GPUBIN=$FATBIN.gfx908 +GPUBIN_INSTR=$GPUBIN-instr +GPUBIN_UPDATED=$GPUBIN_INSTR +GPUBIN_UPDATED_NOTE=$GPUBIN_INSTR.updated-note +GPUBIN_FINAL=$GPUBIN.final + +# Contains names of instrumented kernels +NAMES_FILE=$GPUBIN.instrumentedKernelNames + +NOTE_IN=$GPUBIN.note +NOTE_OUT=$NOTE_IN.expanded + +# 1. Extract fatbin. This will output a $FATBIN +extract-fatbin $EXEC_IN + +# 2. Extract gfx908 bin. This will output $GPUBIN +extract-gpubin gfx908 $FATBIN + +# 3. Run the mutator, instrument kernels (also use the information from step 3). +# This will also emit a file containing list of instrumented kernels ($NAMES_FILE) +# $MUTATOR -procedure-count $GPUBIN +$MUTATOR $GPUBIN + +# 4. Update kernel descriptors for instrumented kernels +# This will produce $GPUBIN_UPDATED +# update-kd $NAMES_FILE $GPUBIN_INSTR + +# 5. Modify the note metadata +# +# 5.1 Extract the note section from original binary +llvm-objcopy --dump-section=.note=$NOTE_IN $GPUBIN + +# 5.2 For each instrumented kernel, modify the metadata as follows: +# - Expand the kernarg buffer with 1 additional argument, which the additional memory that we will allocate via the host. +# - Increase SGPR usage to 112 (GFX908 only for now) +# This will emit $NOTE_OUT. +update-note $NAMES_FILE $NOTE_IN + +# 5.3 Copy the updated binary, remove the note section +cp $GPUBIN_INSTR $GPUBIN_UPDATED_NOTE +llvm-objcopy --remove-section=.note $GPUBIN_UPDATED_NOTE + +# 5.4 Add the expanded note section +llvm-objcopy --add-section=.note=$NOTE_OUT $GPUBIN_UPDATED_NOTE + +# 5.5 Update the program header for the notes section +update-note-phdr $GPUBIN_UPDATED_NOTE $GPUBIN_FINAL + +# 6. Update original fatbin with instrumented gpu binary ($GPUBIN_FINAL) +# This will emit $FATBIN_UPDATED +update-fatbin gfx908 $GPUBIN_FINAL $FATBIN + +# 7. Update the original executable ($EXEC_IN) by embedding $FATBIN_UPDATED +# This will emit $EXEC_UPDATED +update-exec $EXEC_IN $FATBIN_UPDATED $EXEC_UPDATED + +# 8. Rename fatbin sections so that roc-obj* tools work with the modified executable. Those tools specifically look for the .hip_fatbin section by name. +# - Rename .hip_fatbin section to .old_fatbin +# - Rename .new_fatbin section to .hip_fatbin +# It is possible to do this within the update-exec tool, but doing it here is simpler and less error-prone +llvm-objcopy --rename-section .hip_fatbin=.old_fatbin $EXEC_UPDATED $EXEC_RENAMED1 +llvm-objcopy --rename-section .new_fatbin=.hip_fatbin $EXEC_RENAMED1 $EXEC_RENAMED2 + +cp $EXEC_RENAMED2 $EXEC_OUT + +chmod +x $EXEC_OUT diff --git a/preload.cpp b/preload.cpp new file mode 100644 index 0000000..5b0b034 --- /dev/null +++ b/preload.cpp @@ -0,0 +1,273 @@ +#include "hip/hip_runtime.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Environment variable for the instrumentation variable table path: +const char *instrumentationVariableTableEnv = "DYNINST_AMDGPU_INSTRUMENTATON_VAR_TABLE"; + +// Environment variable for the instrumented kernel names path: +const char *instrumentedKernelNamesEnv = "DYNINST_AMDGPU_INSTRUMENTED_KERNEL_NAMES"; + +// This will be used to print the names and values of the instrumentation variables after the kernel launch is done and the instrumentation variables are copied back. +struct InstrumentationVarTableEntry { + int offset; + std::string name; + // TODO : This needs a size field too + + InstrumentationVarTableEntry(std::vector words) { + assert(words.size() == 2); + offset = std::stoi(words[0]); + name = words[1]; + } +}; + +std::unordered_map &getKernargSizeMap() { + static std::unordered_map instance; + return instance; +} + +std::unordered_map &getFirstHiddenArgIndexMap() { + static std::unordered_map instance; + return instance; +} + +std::vector &getInstrumentationVarTableEntries() { + static std::vector instance; + return instance; +} + +// Read words from a string +void getWords(const std::string &str, std::vector &words) { + std::stringstream ss(str); + std::string word; + while (ss >> word) { + words.push_back(word); + } +} + +// The code here is to retrieve the map : +// offset -> instrumentation variable name + +// The table is sorted by offset +void readInstrumentedVarTable(const std::string &filePath) { + auto &tableEntries = getInstrumentationVarTableEntries(); + std::ifstream tableFile(filePath); + std::string line; + + assert(tableFile.is_open()); + + std::vector words; + + while (std::getline(tableFile, line)) { + getWords(line, words); + InstrumentationVarTableEntry tableEntry(words); + tableEntries.push_back(tableEntry); + words.clear(); + } + + tableFile.close(); +} + +// This is used to retrieve the maps: +// kernelName -> kernargBufferSize +// kernelName -> firstHiddenArgIndex +// +// We extend the kernel signature to take an additional argument, which is the memory holding +// instrumentation variables. The map will be used to update the kernarg signature with a +// bigger kernarg buffer size, to accomodate for the additional argument. +void readPreloadInfo(const std::string &filePath) { + auto &kernargSizeMap = getKernargSizeMap(); + auto &firstHiddenArgIndexMap = getFirstHiddenArgIndexMap(); + + std::cerr << "readPreloadMaps : reading " << filePath << "\n"; + std::ifstream mapFile(filePath); + std::string line; + + assert(mapFile.is_open()); + + std::vector words; + + + std::cerr << " size : " << kernargSizeMap.size() << '\n'; + for(auto it : kernargSizeMap) { + std::cerr << it.first << ' ' << it.second << '\n'; + } + + while (std::getline(mapFile, line)) { + getWords(line, words); + std::cerr << words.size() << '\n'; + assert(words.size() == 3); // ( ) + + std::string kernelName = words[0]; + + int kernargSize = std::stoi(words[1]); + kernargSizeMap[kernelName] = kernargSize; + + int firstHiddenArgIndex = std::stoi(words[2]); + firstHiddenArgIndexMap[kernelName] = firstHiddenArgIndex; + + words.clear(); + } + mapFile.close(); +} + +typedef void (*registerFunc_t ) ( + void** modules, + const void* hostFunction, + char* deviceFunction, + const char* deviceName, + unsigned int threadLimit, + uint3* tid, + uint3* bid, + dim3* blockDim, + dim3* gridDim, + int* wSize); + +static registerFunc_t realRegisterFunction; + +static std::unordered_map addressToKernelName; + +extern "C" void __hipRegisterFunction( + void** modules, + const void* hostFunction, + char* deviceFunction, + const char* deviceName, + unsigned int threadLimit, + uint3* tid, + uint3* bid, + dim3* blockDim, + dim3* gridDim, + int* wSize) { + + if(realRegisterFunction == 0) { + realRegisterFunction = (registerFunc_t) dlsym(RTLD_NEXT,"__hipRegisterFunction"); + // Map address to kernel name + addressToKernelName[hostFunction] = std::string(deviceFunction); + } + realRegisterFunction(modules,hostFunction,deviceFunction,deviceName,threadLimit,tid,bid,blockDim,gridDim,wSize); + return; +} + +typedef uint32_t (*launch_t)(const void *hostFunction, dim3 gridDim, + dim3 blockDim, void **args, size_t sharedMemBytes, + hipStream_t stream); +launch_t realLaunch = 0; + +extern "C" hipError_t hipLaunchKernel(const void *hostFunction, dim3 gridDim, + dim3 blockDim, void **args, + size_t sharedMemBytes, + hipStream_t stream) { + + if (realLaunch == 0) { + realLaunch = (launch_t)dlsym(RTLD_NEXT, "hipLaunchKernel"); + } + assert(realLaunch != 0); + + auto &kernargSizeMap = getKernargSizeMap(); + auto &instrumentationVarTableEntries = getInstrumentationVarTableEntries(); + + // Step 0. Get kernel name + auto iter = addressToKernelName.find(hostFunction); + if (iter == addressToKernelName.end()) { + std::cerr << "ERROR : kernel being launched wasn't registered by hipRegisterFunction\n" + << "Doing regular launch..."; + + realLaunch(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream); + + // host code might unnecessarily get triggered if this isn't hipSuccess + return hipSuccess; + } + + std::string kernelName = iter->second; + + // Step 1. Check whether this is an instrumented kernel, i.e it should be in kernargSizeMapPath. + // If not instrumented, just launch it. + auto it = kernargSizeMap.find(kernelName); + int kernargSize = it->second; + if (it == kernargSizeMap.end()) { + // Do regular launch + std::cerr << kernelName << " is not instrumented. Doing regular launch\n"; + realLaunch(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream); + return hipSuccess; + } + + // Step 2. Get size of instrumentation memory + // TODO: Use size + assert(!instrumentationVarTableEntries.empty()); + InstrumentationVarTableEntry lastEntry = *(instrumentationVarTableEntries.end() - 1); + size_t allocSize = lastEntry.offset + 4; + unsigned *instrumentationDataHost = (unsigned *)calloc(1, allocSize); + + std::cerr << '\n'; + + unsigned *instrumentationDataDevice; + + hipError_t hip_ret = hipMalloc((void **)&instrumentationDataDevice, allocSize); + assert(hip_ret == hipSuccess); + + hip_ret = hipMemset(instrumentationDataDevice, 0, allocSize); + + assert(hip_ret == hipSuccess); + + int newKernargSize = kernargSize + sizeof(void *); + void **newArgs = (void **)malloc(newKernargSize); + + memcpy(newArgs, args, kernargSize); + + int newArgIndex = getFirstHiddenArgIndexMap()[kernelName]; + newArgs[newArgIndex] = (void *)(&instrumentationDataDevice); + + std::cerr << "Launching instrumented kernel : " << kernelName << '\n'; + + auto start = std::chrono::high_resolution_clock::now(); + + realLaunch(hostFunction, gridDim, blockDim, newArgs, sharedMemBytes, stream); + assert(hipStreamSynchronize(stream) == hipSuccess); + + auto end = std::chrono::high_resolution_clock::now(); + + std::chrono::duration elapsed = end - start; + std::cout << "Runtime : " << elapsed.count() << " ms\n"; + + std::cerr << "Kernel execution complete. Copying instrumentation variables to host...\n"; + + hip_ret = hipMemcpy(instrumentationDataHost, instrumentationDataDevice, /* size = */ allocSize, + hipMemcpyDeviceToHost); + assert(hip_ret == hipSuccess); + + std::cerr << "Done.\n"; + std::cerr << "Instrumentation variable values: \n"; + for (auto entry : instrumentationVarTableEntries) { + std::cerr << entry.name << " = " << instrumentationDataHost[entry.offset / 4] << '\n'; + } + std::cerr << '\n'; + return hipSuccess; +} + +__attribute__((constructor)) void setup(void) { + realLaunch = 0; + + const char *kernargSizeMapPath = getenv(instrumentedKernelNamesEnv); + if (!kernargSizeMapPath) { + std::cerr << "LD_PRELOAD setup: " << instrumentedKernelNamesEnv << " not defined\n"; + exit(1); + } + + readPreloadInfo(kernargSizeMapPath); + + const char *tableFilePath = getenv(instrumentationVariableTableEnv); + if (!tableFilePath) { + std::cerr << "LD_PRELOAD setup: " << instrumentationVariableTableEnv << " not defined\n"; + exit(1); + } + readInstrumentedVarTable(tableFilePath); +} diff --git a/update-exec.cpp b/update-exec.cpp new file mode 100644 index 0000000..eaff109 --- /dev/null +++ b/update-exec.cpp @@ -0,0 +1,428 @@ +#include "elfio/elfio.hpp" + +#include +#include +#include +#include +#include + +// This tool creates a clone of the original executable, adds the new fatbin +// to the clone, and later patches the clone so that the Linux kernel loader can +// see the program headers. +// +// usage: +// update-exec + +// These maps are for correcting the section links in the clone. +std::unordered_map ogToNewSectionMap; +std::unordered_map newToOgSectionMap; + +static void showHelp(const char *toolName) { + std::cout << "usage : \n"; + std::cout << " "; + std::cout << toolName << " \n\n"; + std::cout << toolName << " will emit containing the \n"; +} + +static void dumpSection(const ELFIO::section *section, bool printContents = true) { + assert(section && "section must be non-null"); + + std::cout << "section : " << section->get_name() << ", "; + std::cout << "size : " << section->get_size() << ", "; + std::cout << "offset : " << section->get_offset() << ", "; + std::cout << "addr-align : " << section->get_addr_align() << ", "; + std::cout << "entry-size : " << section->get_entry_size() << '\n'; + + if (!printContents) + return; + + std::cout << "section contents :\n"; + + std::cout << std::hex; + for (int i = 0; i < section->get_size(); ++i) { + std::cout << (unsigned)section->get_data()[i] << ' '; + } + std::cout << std::dec << '\n'; +} + +// === SECTION-GETTING HELPERS BEGIN === +// +ELFIO::section *getSection(const std::string §ionName, const ELFIO::elfio &file) { + for (int i = 0; i < file.sections.size(); ++i) { + if (file.sections[i]->get_name() == sectionName) + return file.sections[i]; + } + return nullptr; +} + +ELFIO::section *getFatbinSection(const ELFIO::elfio &file) { + return getSection(".hip_fatbin", file); +} + +ELFIO::section *getFatbinWrapperSection(const ELFIO::elfio &file) { + return getSection(".hipFatBinSegment", file); +} +// +// === SECTION-GETTING HELPERS END === + +static size_t getFileSize(const std::string &filePath) { + std::ifstream file(filePath); + assert(file.is_open()); + + file.seekg(0, std::ifstream::end); + + std::streampos pos = file.tellg(); + std::streamoff offset = pos - std::streampos(0); + uint64_t size = static_cast(offset); + + file.close(); + return size; +} + +ELFIO::segment *getPtLoad1(const ELFIO::elfio &file) { + for (int i = 0; i < file.segments.size(); ++i) { + auto segment = file.segments[i]; + if (segment->get_type() == ELFIO::PT_LOAD) + return segment; + } + return nullptr; +} + +ELFIO::segment *getPhdrSegment(const ELFIO::elfio &file) { + size_t entryPoint = file.get_entry(); + for (int i = 0; i < file.segments.size(); ++i) { + auto segment = file.segments[i]; + if (segment->get_type() == ELFIO::PT_PHDR) + return segment; + } + return nullptr; +} + +ELFIO::segment *getLastSegment(const ELFIO::elfio &execFile) { + const size_t numSegments = execFile.segments.size(); + assert(numSegments != 0); + + ELFIO::segment *lastSegment = execFile.segments[0]; + for (size_t i = 0; i < numSegments; ++i) { + ELFIO::segment *currSegment = execFile.segments[i]; + size_t currSegmentBegin = currSegment->get_virtual_address(); + size_t currSegmentSize = currSegment->get_memory_size(); + size_t currSegmentEnd = currSegmentBegin + currSegmentSize; + + size_t lastSegmentBegin = lastSegment->get_virtual_address(); + size_t lastSegmentSize = lastSegment->get_memory_size(); + size_t lastSegmentEnd = lastSegmentBegin + lastSegmentSize; + + if (currSegmentEnd > lastSegmentEnd) { + lastSegment = currSegment; + } + } + + return lastSegment; +} + +void cloneHeader(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { + newExec.create(ogExec.get_class(), ogExec.get_encoding()); + newExec.set_os_abi(ogExec.get_os_abi()); + newExec.set_abi_version(ogExec.get_abi_version()); + newExec.set_type(ogExec.get_type()); + newExec.set_machine(ogExec.get_machine()); + newExec.set_entry(ogExec.get_entry()); +} + +bool shouldClone(const ELFIO::section *section) { + switch (section->get_type()) { + case ELFIO::SHT_NULL: + return false; + + case ELFIO::SHT_STRTAB: + // Don't clone section header string table, ELFIO will create a new one + if (section->get_name() == ".shstrtab") + return false; + return true; + + default: + return true; + } +} + +void cloneSections(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { + auto ogSections = ogExec.sections; + for (size_t i = 0; i < ogSections.size(); ++i) { + ELFIO::section *ogSection = ogSections[i]; + + if (!shouldClone(ogSection)) + continue; + + std::cout << "cloning\n"; + dumpSection(ogSection, false); + std::cout << '\n'; + + const std::string &name = ogSection->get_name(); + ELFIO::section *newSection = newExec.sections.add(name); + newSection->set_type(ogSection->get_type()); + newSection->set_flags(ogSection->get_flags()); + newSection->set_info(ogSection->get_info()); + + // NOTE: This can be incorrect link, and will be corrected later, after all + // sections are cloned. + newSection->set_link(ogSection->get_link()); + + newSection->set_addr_align(ogSection->get_addr_align()); + newSection->set_entry_size(ogSection->get_entry_size()); + newSection->set_address(ogSection->get_address()); + newSection->set_size(ogSection->get_size()); + + if (const char *contents = ogSection->get_data()) + newSection->set_data(contents, ogSection->get_size()); + + ogToNewSectionMap[ogSection] = newSection; + newToOgSectionMap[newSection] = ogSection; + } +} + +void correctSectionLinks(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { + auto ogSections = ogExec.sections; + auto newSections = newExec.sections; + + // If ogSection's sh_link holds index in ogExec's section header table, we + // must update newSection's sh_link hold corresponding index in newExec's + // section header table. + for (size_t i = 0; i < newSections.size(); ++i) { + auto *currentNewSection = newSections[i]; + + auto iter1 = newToOgSectionMap.find(currentNewSection); + if (iter1 == newToOgSectionMap.end()) + continue; + + auto *ogSection = iter1->second; + auto ogLinkSectionIdx = ogSection->get_link(); + auto *ogLinkSection = ogSections[ogLinkSectionIdx]; + + auto iter2 = ogToNewSectionMap.find(ogLinkSection); + if (iter2 == ogToNewSectionMap.end()) + continue; + + auto *newLinkSection = iter2->second; + auto newLinkSectionIdx = newLinkSection->get_index(); + currentNewSection->set_link(newLinkSectionIdx); + } +} + +void cloneSegments(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { + auto ogSegments = ogExec.segments; + for (size_t i = 0; i < ogSegments.size(); ++i) { + ELFIO::segment *ogSegment = ogSegments[i]; + ELFIO::segment *newSegment = newExec.segments.add(); + newSegment->set_type(ogSegment->get_type()); + newSegment->set_flags(ogSegment->get_flags()); + newSegment->set_align(ogSegment->get_align()); + newSegment->set_virtual_address(ogSegment->get_virtual_address()); + newSegment->set_physical_address(ogSegment->get_physical_address()); + + newSegment->set_file_size(ogSegment->get_file_size()); + newSegment->set_memory_size(ogSegment->get_memory_size()); + } + + auto newSegments = newExec.segments; + auto newSections = newExec.sections; + + // Now map new sections into new segments + for (size_t i = 0; i < newSections.size(); ++i) { + auto currSection = newSections[i]; + auto currSectionBegin = currSection->get_address(); + auto currSectionSize = currSection->get_size(); + auto currSectionEnd = currSectionBegin + currSectionSize; + + for (size_t j = 0; j < newSegments.size(); ++j) { + auto newSegmentBegin = newSegments[j]->get_virtual_address(); + auto newSegmentSize = newSegments[j]->get_memory_size(); + auto newSegmentEnd = newSegmentBegin + newSegmentSize; + bool c1 = currSectionBegin >= newSegmentBegin && currSectionBegin < newSegmentEnd; + bool c2 = currSectionEnd <= newSegmentEnd; + if (c1 && c2) { + newSegments[j]->add_section(currSection, 1); + } + } + } +} + +void cloneExec(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { + cloneHeader(ogExec, newExec); + cloneSections(ogExec, newExec); + correctSectionLinks(ogExec, newExec); + cloneSegments(ogExec, newExec); +} + +void updateFatbinAddr(ELFIO::elfio &execFile, uint64_t newAddr) { + ELFIO::section *fatbinWrapperSection = getFatbinWrapperSection(execFile); + + // address is at offset 8. + uint64_t *addrPtr = (uint64_t *)(fatbinWrapperSection->get_data() + 8); + *addrPtr = newAddr; +} + +// Create a .new_fatbin section, map it to a new PT_LOAD segment, update the +// fatbin wrapper. +void addNewFatbin(ELFIO::elfio &newExec, const char *newFatbinContent, size_t newFatbinSize) { + + ELFIO::section *fatbinSection = getFatbinSection(newExec); + assert(fatbinSection); + + // Calculate next virtual address for loading the new fatbin. + ELFIO::segment *lastSegment = getLastSegment(newExec); + size_t nextAddr = lastSegment->get_virtual_address() + lastSegment->get_memory_size(); + + size_t alignment = fatbinSection->get_addr_align(); + assert(alignment != 0); + + while (nextAddr % alignment != 0) { + ++nextAddr; + } + + ELFIO::section *newFatbinSection = newExec.sections.add(".new_fatbin"); + newFatbinSection->set_type(fatbinSection->get_type()); + newFatbinSection->set_flags(fatbinSection->get_flags()); + newFatbinSection->set_info(fatbinSection->get_info()); + newFatbinSection->set_addr_align(fatbinSection->get_addr_align()); + newFatbinSection->set_entry_size(fatbinSection->get_entry_size()); + newFatbinSection->set_size(newFatbinSize); + newFatbinSection->set_data(newFatbinContent, newFatbinSize); + newFatbinSection->set_address(nextAddr); + + ELFIO::segment *newSegment = newExec.segments.add(); + newSegment->set_type(ELFIO::PT_LOAD); + newSegment->set_flags(ELFIO::PF_R); + newSegment->set_align(fatbinSection->get_addr_align()); + newSegment->set_virtual_address(nextAddr); + newSegment->set_physical_address(nextAddr); + + newSegment->add_section(newFatbinSection, 1); + updateFatbinAddr(newExec, nextAddr); +} + +// This is for patching the clone at last. For some reason, editing raw segments +// doesn't work with ELFIO. The macros in elf.h conflict with ELFIO's constants, +// hence keeping the include here. +#include + +void patchExec(const char *rwExecPath) { + ELFIO::elfio newExecFile; + FILE *rawNewElf = fopen(rwExecPath, "rb+"); + + if (!rawNewElf || !newExecFile.load(rwExecPath)) { + std::cout << "can't find or process new ELF file " << rwExecPath << '\n'; + exit(1); + } + + ELFIO::segment *ptLoad1 = getPtLoad1(newExecFile); + uint64_t ptLoad1Offset = ptLoad1->get_offset(); + ELFIO::segment *phdrSeg = getPhdrSegment(newExecFile); + + char *ptLoad1Data = (char *)ptLoad1->get_data(); + char *pHdrs = (char *)phdrSeg->get_data(); + + size_t numZeroes = 0; + for (size_t i = 0; i < ptLoad1->get_memory_size() && ptLoad1Data[i] == 0; ++i) { + ++numZeroes; + } + + if (numZeroes < phdrSeg->get_memory_size()) { + std::cout << "can't patch final executable, please explicitly use ld to run it\n"; + exit(1); + } + + // Step 1. Copy program header table to beginning of PT_LOAD1. + std::cout << "Copying program header table to beginning of PT_LOAD1...\n"; + if (fseek(rawNewElf, ptLoad1Offset, SEEK_SET)) { + std::cout << "error going to " << ptLoad1Offset << '\n'; + exit(1); + } + std::cout << fwrite(pHdrs, sizeof(char), phdrSeg->get_file_size(), rawNewElf) + << " bytes written to PT_LOAD1\n"; + std::cout << '\n'; + + // Step 2. Update PT_LOAD1's program header (the one present in PT_LOAD1). + // Update p_vaddr to hold the address of PT_LOAD1 + assert(fseek(rawNewElf, ptLoad1Offset, SEEK_SET) == 0); + Elf64_Phdr progHeader; + std::cout << "Updating PT_LOAD1's program header in PT_LOAD1...\n"; + std::cout << fread(&progHeader, sizeof(Elf64_Phdr), 1, rawNewElf) + << " Elf64_Phdrs read from beginning of PT_LOAD1\n"; + + progHeader.p_vaddr = ptLoad1->get_virtual_address(); + progHeader.p_paddr = ptLoad1->get_physical_address(); + assert(fseek(rawNewElf, ptLoad1Offset, SEEK_SET) == 0); + std::cout << fwrite(&progHeader, sizeof(Elf64_Phdr), 1, rawNewElf) + << " Elf64_Phdrs written to beginning of PT_LOAD1\n"; + std::cout << '\n'; + + // Step 3. Update ELF header on disk. + // The offset of program header table should be offset of PT_LOAD1. + std::cout << "Updating ELF header's e_phoff to PT_LOAD1's offset...\n"; + Elf64_Ehdr elfHeader; + assert(fseek(rawNewElf, 0, SEEK_SET) == 0); + std::cout << (fread(&elfHeader, sizeof(Elf64_Ehdr), 1, rawNewElf)) + << " Elf64_Ehdrs read from beginning of " << rwExecPath << '\n'; + + std::cout << "old e_phoff : " << elfHeader.e_phoff << '\n'; + elfHeader.e_phoff = ptLoad1->get_offset(); + std::cout << "new e_phoff : " << elfHeader.e_phoff << '\n'; + + assert(fseek(rawNewElf, 0, SEEK_SET) == 0); + std::cout << fwrite(&elfHeader, sizeof(Elf64_Ehdr), 1, rawNewElf) + << " Elf64_Ehdrs written to beginning of " << rwExecPath << '\n'; + + fclose(rawNewElf); +} + +int main(int argc, char **argv) { + if (argc != 4) { + std::cout << "exactly 3 arguments to " << argv[0] << " expected\n"; + showHelp(argv[0]); + exit(1); + } + + const char *execFilePath = argv[1]; + const char *newFatbinPath = argv[2]; + const char *rwExecPath = argv[3]; + + ELFIO::elfio execFile; + ELFIO::elfio newExecFile; + std::ifstream newFatbin; + + if (!execFile.load(execFilePath)) { + std::cout << "can't find or process ELF file " << execFilePath << '\n'; + exit(1); + } + + ELFIO::section *fatbinSection = getFatbinSection(execFile); + if (!fatbinSection) { + std::cout << ".hip_fatbin section not found in " << execFilePath << "\n"; + exit(1); + } + + ELFIO::section *fatbinWrapperSection = getFatbinWrapperSection(execFile); + if (!fatbinWrapperSection) { + std::cout << ".hipFatBinSegment section not found in " << execFilePath << "\n"; + exit(1); + } + + cloneExec(execFile, newExecFile); + + size_t newFatbinSize = getFileSize(newFatbinPath); + char *newFatbinContent = new char[newFatbinSize]; + + newFatbin.open(newFatbinPath, std::ios::binary); + newFatbin.read(newFatbinContent, newFatbinSize); + addNewFatbin(newExecFile, newFatbinContent, newFatbinSize); + + delete[] newFatbinContent; + newFatbin.close(); + + std::cout << newExecFile.validate() << '\n'; + newExecFile.save(rwExecPath); + + // To ensure that the linux kernel loader picks up the program headers. + patchExec(rwExecPath); +} diff --git a/update-fatbin.cpp b/update-fatbin.cpp new file mode 100644 index 0000000..55098da --- /dev/null +++ b/update-fatbin.cpp @@ -0,0 +1,255 @@ +#include +#include +#include +#include +#include + +struct GpuBinInfo { + GpuBinInfo(const std::string &id_, uint64_t offset_, uint64_t size_) + : id(id_), offset(offset_), size(size_) {} + + std::string id; + uint64_t offset; + uint64_t size; + + void dump(std::ostream &os) { + os << "id : " << id << " offset : " << offset << " size : " << size << '\n'; + } +}; + +static void showHelp(const std::string &toolName) { + std::cerr << "Usage : " << toolName << " " + << " " + << "" << std::endl; + std::cerr << "supported architectures : gfx900, gfx906, gfx908, gfx90a, gfx940" << std::endl; + std::cerr << "This tool create a fat binary containing an instrumented GPU binary" << std::endl; +} + +static void getgpuBinInfos(const std::string &fatbinPath, std::vector &infos) { + std::ifstream fatbin(fatbinPath, std::ios::binary); + if (!fatbin) { + std::cerr << "error : can't open " << fatbinPath << std::endl; + exit(1); + } + + char buffer[24 + 1]; + fatbin.read(buffer, 24); + buffer[24] = 0; + + assert(std::string(buffer) == "__CLANG_OFFLOAD_BUNDLE__"); + + uint64_t numBundleEntries = 0; + fatbin.read(reinterpret_cast(&numBundleEntries), sizeof(numBundleEntries)); + + while (numBundleEntries) { + uint64_t bundleEntryCodeObjectOffset; // offset from begining of the fatbin + fatbin.read(reinterpret_cast(&bundleEntryCodeObjectOffset), + sizeof(bundleEntryCodeObjectOffset)); + + uint64_t size; + fatbin.read(reinterpret_cast(&size), sizeof(size)); + + uint64_t idLength; + fatbin.read(reinterpret_cast(&idLength), sizeof(idLength)); + + char id[idLength + 1]; + fatbin.read(id, idLength); + id[idLength] = 0; // Make id null-terminated + + GpuBinInfo info(id, bundleEntryCodeObjectOffset, size); + infos.push_back(info); + + numBundleEntries--; + } + + fatbin.close(); +} + +void dumpInfos(std::vector &infos) { + for (auto &info : infos) + info.dump(std::cout); +} + +static int getIndex(const std::string &arch, const std::vector &infos) { + int index = -1; + for (int i = 0; i < infos.size(); ++i) { + const GpuBinInfo &info = infos[i]; + size_t idLength = info.id.length(); + if (info.id.substr(idLength - arch.length()) == arch) { + index = i; + } + } + return index; +} + +static uint64_t alignUp(uint64_t value, uint64_t alignment) { + if (alignment <= 1) + return value; + + uint64_t diff = value % alignment; + return diff == 0 ? value : value + (alignment - diff); +} + +int main(int argc, char *argv[]) { + if (argc != 4) { + showHelp(argv[0]); + exit(1); + } + + std::string arch(argv[1]); + std::string elfBinPath(argv[2]); + std::string fatbinPath(argv[3]); + + std::vector gpuBinInfos; + getgpuBinInfos(fatbinPath, gpuBinInfos); + + int archIndex = getIndex(arch, gpuBinInfos); + if (archIndex == -1) { + std::cerr << fatbinPath << " doesn't contain a " << arch << " binary" << std::endl; + exit(1); + } + + std::ifstream elfBin(elfBinPath, std::ios::binary); + if (!elfBin) { + std::cerr << "error : can't open " << elfBinPath << std::endl; + exit(1); + } + + // Determine size of the elf binary and read it + elfBin.seekg(0, std::ios::end); + std::streampos pos = elfBin.tellg(); + std::streamoff offset = pos - std::streampos(0); + uint64_t elfBinSize = static_cast(offset); + + std::cout << "elfBinSize = " << elfBinSize << '\n'; + + char elfBinContents[elfBinSize]; + elfBin.seekg(0, std::ios::beg); + elfBin.read(elfBinContents, elfBinSize); + elfBin.close(); + + std::vector newBinInfos(gpuBinInfos); + + // If the binary we want to "replace" is followed by other binaries, we must + // update their offsets. Since all offsets are 0x1000 (i.e 4096) aligned we also + // respect the alignment when updating the offsets. + newBinInfos[archIndex].size = elfBinSize; + + for (int i = archIndex + 1; i < newBinInfos.size(); ++i) { + GpuBinInfo prevInfo = newBinInfos[i - 1]; + if (prevInfo.offset + prevInfo.size > newBinInfos[i].offset) { + newBinInfos[i].offset = alignUp(prevInfo.offset + prevInfo.size, 0x1000); + } + } + + // dumpInfos(gpuBinInfos); + // dumpInfos(newBinInfos); + + // Now we create a new fatbin + std::ofstream newFatbin(fatbinPath + ".updated", std::ios::binary); + if (!newFatbin) { + std::cerr << "error : can't open new fatbin" << std::endl; + exit(1); + } + + // "write" doesn't write null-terminated strings + // Magic string + std::string magicStr = "__CLANG_OFFLOAD_BUNDLE__"; + newFatbin.write(magicStr.c_str(), magicStr.size()); + + assert(gpuBinInfos.size() == newBinInfos.size()); + + // Number of bundle entries + uint64_t numBundleEntries = newBinInfos.size(); + newFatbin.write(reinterpret_cast(&numBundleEntries), sizeof(numBundleEntries)); + + // Write the following for each entry: + // offset + // size + // id length + // id + for (auto &info : newBinInfos) { + newFatbin.write(reinterpret_cast(&info.offset), sizeof(info.offset)); + newFatbin.write(reinterpret_cast(&info.size), sizeof(info.size)); + + uint64_t length = info.id.size(); + newFatbin.write(reinterpret_cast(&length), sizeof(length)); + newFatbin.write(info.id.c_str(), length); + } + + // Now we write the GPU objects + // 1. For each object upto archIndex, write padding, and copy contents from fatbin to newFatbin. + // Also assert that the offsets match what we computed. + // + // 2. Now write padding, and the updated gpubin provided as argument. + // + // 3. For each object after archIndex, write padding, and copy contents from fatbin to newFatbin. + // Also assert that the offsets match what we computed. + + std::ifstream fatbin(fatbinPath, std::ios::binary); + assert(fatbin); + + // Writing upto archIndex + for (int i = 0; i < archIndex; ++i) { + char buffer[gpuBinInfos[i].size]; + fatbin.seekg(gpuBinInfos[i].offset, std::ios::beg); + fatbin.read(buffer, gpuBinInfos[i].size); + + // Write padding before we start writing the ELF files + pos = newFatbin.tellp(); + offset = static_cast(pos - std::streampos(0)); + + uint64_t paddingCount = alignUp(offset, 0x1000) - offset; + std::vector padding(paddingCount, 0); + newFatbin.write(padding.data(), paddingCount); + + pos = newFatbin.tellp(); + offset = static_cast(pos - std::streampos(0)); + + // std::cout << offset << ' ' << gpuBinInfos[i].offset << '\n'; + assert(offset == newBinInfos[i].offset && + "Offset while writing ELF in new fatbin must match what we computed"); + newFatbin.write(buffer, gpuBinInfos[i].size); + } + + // The instrumented gpubin + pos = newFatbin.tellp(); + offset = static_cast(pos - std::streampos(0)); + + uint64_t newBinPaddingCount = alignUp(offset, 0x1000) - offset; + std::cout << "padding for instrumented bin = " << newBinPaddingCount << " bytes\n"; + + std::vector newBinPadding(newBinPaddingCount, ' '); + newFatbin.write(newBinPadding.data(), newBinPaddingCount); + + pos = newFatbin.tellp(); + offset = static_cast(pos - std::streampos(0)); + + std::cout << "writing instrumented bin at offset " << offset << '\n'; + newFatbin.write(elfBinContents, elfBinSize); + + // After archIndex + for (int i = archIndex + 1; i < gpuBinInfos.size(); ++i) { + char buffer[gpuBinInfos[i].size]; + fatbin.seekg(gpuBinInfos[i].offset, std::ios::beg); + fatbin.read(buffer, gpuBinInfos[i].size); + + pos = newFatbin.tellp(); + offset = static_cast(pos - std::streampos(0)); + + uint64_t paddingCount = alignUp(offset, 0x1000) - offset; + std::vector padding(paddingCount, 0); + newFatbin.write(padding.data(), paddingCount); + + pos = newFatbin.tellp(); + offset = static_cast(pos - std::streampos(0)); + + // std::cout << offset << ' ' << gpuBinInfos[i].offset << ' ' << newBinInfos[i].offset << '\n'; + assert(offset == newBinInfos[i].offset && + "Offset while writing ELF in new fatbin must match what we computed"); + newFatbin.write(buffer, gpuBinInfos[i].size); + } + + fatbin.close(); + newFatbin.close(); +} diff --git a/update-note-phdr.cpp b/update-note-phdr.cpp new file mode 100644 index 0000000..de49b0d --- /dev/null +++ b/update-note-phdr.cpp @@ -0,0 +1,91 @@ +#include "elfio/elfio.hpp" + +#include + +// usage: +// update-note-phdr + +static void showHelp(const char *toolName) { + std::cout << "usage : \n"; + std::cout << " "; + std::cout << toolName << " \n\n"; +} + +static void dumpSection(const ELFIO::section *section, bool printContents = true) { + assert(section && "section must be non-null"); + + std::cout << "section : " << section->get_name() << ", "; + std::cout << "size : " << section->get_size() << ", "; + std::cout << "offset : " << section->get_offset() << ", "; + std::cout << "addr-align : " << section->get_addr_align() << ", "; + std::cout << "entry-size : " << section->get_entry_size() << '\n'; + + if (!printContents) + return; + + std::cout << "section contents :\n"; + + std::cout << std::hex; + for (int i = 0; i < section->get_size(); ++i) { + std::cout << (unsigned)section->get_data()[i] << ' '; + } + std::cout << std::dec << '\n'; +} + +ELFIO::section *getSection(const std::string §ionName, const ELFIO::elfio &file) { + for (int i = 0; i < file.sections.size(); ++i) { + if (file.sections[i]->get_name() == sectionName) + return file.sections[i]; + } + return nullptr; +} + +ELFIO::section *getNoteSection(const ELFIO::elfio &file) { return getSection(".note", file); } + +ELFIO::segment *getNoteSegment(const ELFIO::elfio &file) { + for (int i = 0; i < file.segments.size(); ++i) { + auto segment = file.segments[i]; + if (segment->get_type() == ELFIO::PT_NOTE) + return segment; + } + return nullptr; +} + +int main(int argc, char **argv) { + if (argc != 3) { + std::cout << "exactly 2 arguments to " << argv[0] << " expected\n"; + showHelp(argv[0]); + exit(1); + } + + const char *gpuBinPath = argv[1]; + const char *newGpuBinPath = argv[2]; + std::cerr << "saving file to " << newGpuBinPath << '\n'; + + ELFIO::elfio gpuBin; + + if (!gpuBin.load(gpuBinPath)) { + std::cout << "can't find or process ELF file " << gpuBinPath << '\n'; + exit(1); + } + + ELFIO::section *noteSection = getNoteSection(gpuBin); + if (!noteSection) { + std::cout << ".note section not found in " << gpuBinPath << "\n"; + exit(1); + } + + ELFIO::segment *noteSegment = getNoteSegment(gpuBin); + if (!noteSegment) { + std::cout << ".note segment not found in " << gpuBinPath << "\n"; + exit(1); + } + + noteSection->set_address(noteSegment->get_virtual_address()); + + noteSegment->add_section(noteSection, noteSection->get_addr_align()); + noteSegment->set_file_size(noteSection->get_size()); + noteSegment->set_memory_size(noteSection->get_size()); + + gpuBin.save(newGpuBinPath); +} diff --git a/update-note.cpp b/update-note.cpp new file mode 100644 index 0000000..f6a5f3d --- /dev/null +++ b/update-note.cpp @@ -0,0 +1,258 @@ +#include +#include +#include +#include +#include +#include +#include + + +// This tool updates metadata for instrumented kernels by: +// 1. Adding an additional argument for Dyninst's instrumentation variables +// 2. Maxing out SGPR allocation. + +static bool startsWith(const std::string &prefix, const std::string &str) { + if (prefix.length() > str.length()) + return false; + + return str.substr(0, prefix.length()) == prefix; +} + +struct KernelInfo { + std::string name; + unsigned newKernargBufferSize; + unsigned firstHiddenArgIndex; +}; + +// This number comes from LLVM AMDGPUUsage - https://llvm.org/docs/AMDGPUUsage.html +static constexpr uint32_t GFX908_MAX_SGPR_COUNT = 112; + +// Pointers need to be 8-byte aligned +static constexpr uint32_t PTR_ALIGNMENT = 8; + +// The extra argument for Dyninst's memory is a pointer of 8 bytes +static constexpr uint32_t DYNINST_ARG_SIZE = 8; + +// Create a new argument, which is pointer to the Dyninst's memory buffer for +// variables +void createNewArgument(std::map &newArgument, int offset, + msgpack::zone &z) { + newArgument[".name"] = msgpack::object(std::string("dyninst_mem"), z); + newArgument[".address_space"] = msgpack::object(std::string("global"), z); + newArgument[".offset"] = msgpack::object(offset, z); + newArgument[".size"] = msgpack::object(DYNINST_ARG_SIZE, z); + newArgument[".value_kind"] = msgpack::object(std::string("global_buffer"), z); + newArgument[".access"] = msgpack::object(std::string("read_write"), z); + + std::cerr << "created new argument with offset = " << offset << '\n'; +} + +// The argument list we create is just the signature in the metadata, which is the runtime uses +// to setup the actual kernel arguments. +// The runtime expects all regular arguments first in the signature, even if the argument comes +// after the hidden arguments in the kernarg. +static void createNewArgumentList(std::vector &ogArgumentList, + std::vector &newArgumentList, + unsigned newKernargBufferSize, msgpack::zone &z, KernelInfo &kernelInfo) { + std::map arg; + std::string valueKind; + int i = 0; + for (; i < ogArgumentList.size(); ++i) { + ogArgumentList[i].convert(arg); + msgpack::object valueKindObject = arg[".value_kind"]; + valueKindObject.convert(valueKind); + if (startsWith("hidden", valueKind)) { + break; + } + newArgumentList.push_back(ogArgumentList[i]); + } + + // Now we are at the first hidden arg. + assert(i < ogArgumentList.size() && startsWith("hidden", valueKind)); + kernelInfo.firstHiddenArgIndex = i; + + std::map newArg; + createNewArgument(newArg, newKernargBufferSize, z); + newArgumentList.push_back(msgpack::object(newArg, z)); + std::cerr << "added newArg to new list\n"; + + // Push other arguments + for (; i < ogArgumentList.size(); ++i) { + newArgumentList.push_back(ogArgumentList[i]); + } +} + +static std::string readNoteFile(const std::string &fileName) { + std::ifstream file(fileName); + std::stringstream buffer; + + buffer << file.rdbuf(); + buffer.seekg(0); + return std::string(buffer.str()); +} + +void rewriteNotes(const std::string &fileName, const std::string& newFileName, std::vector &instrumentedKernelInfos) { + // Step 1 - read .note file into buffer + std::string noteBuffer = readNoteFile(fileName); + + std::map metadataMap; + + // Each element represents the signature for a particular kernel. + // Each signature is a map. + std::vector kernelSignatures; + std::map kernelSignature; + + // argumentList is a vector of maps + std::vector argumentList; + + msgpack::zone z; + uint32_t offset = 0; + + // Step 2 - parse the ELF note header. This is not msgpack header. + // First 4 bytes : Size of the Name str (should be AMDGPU\0) + // Second 4 bytes : Size of the note in msgpack format + // Third 4 bytes : Type of the note (should be 32) + // Followed by the Name str + // Followed by padding until 4 byte aligned + // Followed by msgpack data + std::string name_szstr = noteBuffer.substr(0, 4); + uint32_t name_sz = *((uint32_t *)name_szstr.c_str()); + std::string noteType = noteBuffer.substr(8, 4); + std::string name = noteBuffer.substr(12, name_sz); + offset = 12 + name_sz; + while (offset % 4) + offset++; + + // Now we are ready to process the msgpack data + // We unpack from offset calculated above + // Unpack until we get to .group_segment_fixed_size + msgpack::object_handle objHandle; + objHandle = msgpack::unpack(noteBuffer.data() + offset, noteBuffer.size() - offset); + msgpack::object mapRoot = objHandle.get(); + mapRoot.convert(metadataMap); + metadataMap["amdhsa.kernels"].convert(kernelSignatures); + + // Go over each kernel entry and modify the argument list in the metadata for the + // instrumented ones + for (uint32_t i = 0; i < kernelSignatures.size(); i++) { + kernelSignatures[i].convert(kernelSignature); + + std::string kernelName = ""; + kernelSignature[".name"].convert(kernelName); + + auto iter = std::find_if(instrumentedKernelInfos.begin(), instrumentedKernelInfos.end(), + [&kernelName](const KernelInfo &KI) { return KI.name == kernelName; }); + + if (iter == instrumentedKernelInfos.end()) { + continue; + } + + kernelSignature[".args"].convert(argumentList); + std::vector newArgumentList; + uint32_t oldKernargSize = 0; + kernelSignature[".kernarg_segment_size"].convert(oldKernargSize); + + // Rounding up to alignment requirement + uint32_t newArgOffset= ((oldKernargSize + PTR_ALIGNMENT - 1) / PTR_ALIGNMENT) * PTR_ALIGNMENT; + + createNewArgumentList(argumentList, newArgumentList, newArgOffset, z, *iter); + kernelSignature[".args"] = msgpack::object(newArgumentList, z); + + // Dyninst already updated the kernarg size in the kernel descriptor. + // We picked that up when parsing kernelInfos + uint32_t newKernargSize = iter->newKernargBufferSize; + + assert(newArgOffset + DYNINST_ARG_SIZE == newKernargSize); + + kernelSignature[".kernarg_segment_size"] = msgpack::object(newKernargSize, z); + + // We also max out sgpr_count + kernelSignature[".sgpr_count"] = msgpack::object(GFX908_MAX_SGPR_COUNT, z); + + kernelSignatures[i] = msgpack::object(kernelSignature, z); + } + + metadataMap["amdhsa.kernels"] = msgpack::object(kernelSignatures, z); + + msgpack::sbuffer outBuffer; + msgpack::pack(outBuffer, metadataMap); + std::string outString = std::string(outBuffer.data(), outBuffer.size()); + + uint32_t outOffset; + uint32_t desc_sz = outBuffer.size(); + outOffset = name_sz; + + std::ofstream outFile; + outFile.open(newFileName, std::ios::binary); + // Write the headers + outFile.write(reinterpret_cast(&name_sz), sizeof(name_sz)); + outFile.write(reinterpret_cast(&desc_sz), sizeof(desc_sz)); + outFile.write(noteType.c_str(), noteType.size()); + outFile.write(name.c_str(), name.size()); + + // Padding + while (outOffset % 4 != 0) { + outFile.put('\0'); + outOffset += 1; + } + + // Write the msgpack data + outFile << outString; + outOffset += outString.size(); + + // Padding + while (outOffset % 4 != 0) { + outFile.put('\0'); + outOffset += 1; + } + outFile.close(); +} + +void readInstrumentedKernelInfos(const std::string &filePath, + std::vector &instrumentedKernelInfos) { + std::ifstream file(filePath); + std::string word; + + assert(file.is_open()); + + KernelInfo kernelInfo; + while (file >> kernelInfo.name >> kernelInfo.newKernargBufferSize) { + instrumentedKernelInfos.push_back(kernelInfo); + } + + file.close(); +} + +void writeUpdatedKernelInfos(const std::string &filePath, + std::vector &instrumentedKernelInfos) { + std::ofstream file(filePath); + assert(file.is_open()); + + for (auto const kernelInfo : instrumentedKernelInfos) { + file << kernelInfo.name << ' ' << kernelInfo.newKernargBufferSize << ' ' + << kernelInfo.firstHiddenArgIndex << '\n'; + } + file.close(); +} + +int main(int argc, char *argv[]) { + if (argc != 3) { + printf("usage expand_args <.names file> <.note file>\n"); + return -1; + } + + std::string namesFile(argv[1]); + std::string noteFile(argv[2]); + std::string updatedNoteFile(noteFile + ".expanded"); + + std::vector instrumentedKernelInfos; + readInstrumentedKernelInfos(namesFile, instrumentedKernelInfos); + + rewriteNotes(noteFile, updatedNoteFile, instrumentedKernelInfos); + + // The preload library will read this + std::string preloadNamesFile = namesFile + ".preload"; + writeUpdatedKernelInfos(preloadNamesFile, instrumentedKernelInfos); + + return 0; +} From 96921d0ca4466389835719e403f36b8e6ea3aebb Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Tue, 21 Apr 2026 00:33:28 -0500 Subject: [PATCH 03/28] Update CMakeLists.txt for submodule directory names --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ab139c..890a774 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,15 +19,15 @@ project(amdgpu-tooling LANGUAGES CXX) add_executable(update-note update-note.cpp) target_include_directories( - update-note PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/msgpack/include) + update-note PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/msgpack-c/include) add_executable(update-note-phdr update-note-phdr.cpp) target_include_directories( - update-note-phdr PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/elfio-3.11) + update-note-phdr PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/ELFIO) add_executable(extract-fatbin extract-fatbin.cpp) target_include_directories( - extract-fatbin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/elfio-3.11) + extract-fatbin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/ELFIO) add_executable(extract-gpubin extract-gpubin.cpp) @@ -35,7 +35,7 @@ add_executable(update-fatbin update-fatbin.cpp) add_executable(update-exec update-exec.cpp) target_include_directories( - update-exec PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/elfio-3.11) + update-exec PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/ELFIO) # SPECIAL CASE FOR PRELOAD From cc44080f3766ed8e548178b660d6d2f2b02e6d75 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Tue, 21 Apr 2026 00:43:23 -0500 Subject: [PATCH 04/28] Use static for functions in tools and add some comments --- extract-gpubin.cpp | 2 +- preload.cpp | 16 ++++++++-------- update-exec.cpp | 34 +++++++++++++++++----------------- update-fatbin.cpp | 2 +- update-note-phdr.cpp | 3 +++ 5 files changed, 30 insertions(+), 27 deletions(-) diff --git a/extract-gpubin.cpp b/extract-gpubin.cpp index c06dfd2..889919c 100644 --- a/extract-gpubin.cpp +++ b/extract-gpubin.cpp @@ -3,7 +3,7 @@ #include #include -void showHelp(const std::string &toolName) { +static void showHelp(const std::string &toolName) { std::cerr << "Usage : " << toolName << " " << "" << std::endl; std::cerr << "supported architectures : gfx900, gfx906, gfx908, gfx90a, gfx940" << std::endl; diff --git a/preload.cpp b/preload.cpp index 5b0b034..b90a597 100644 --- a/preload.cpp +++ b/preload.cpp @@ -12,10 +12,10 @@ #include // Environment variable for the instrumentation variable table path: -const char *instrumentationVariableTableEnv = "DYNINST_AMDGPU_INSTRUMENTATON_VAR_TABLE"; +static const char *instrumentationVariableTableEnv = "DYNINST_AMDGPU_INSTRUMENTATON_VAR_TABLE"; // Environment variable for the instrumented kernel names path: -const char *instrumentedKernelNamesEnv = "DYNINST_AMDGPU_INSTRUMENTED_KERNEL_NAMES"; +static const char *instrumentedKernelNamesEnv = "DYNINST_AMDGPU_INSTRUMENTED_KERNEL_NAMES"; // This will be used to print the names and values of the instrumentation variables after the kernel launch is done and the instrumentation variables are copied back. struct InstrumentationVarTableEntry { @@ -30,23 +30,23 @@ struct InstrumentationVarTableEntry { } }; -std::unordered_map &getKernargSizeMap() { +static std::unordered_map &getKernargSizeMap() { static std::unordered_map instance; return instance; } -std::unordered_map &getFirstHiddenArgIndexMap() { +static std::unordered_map &getFirstHiddenArgIndexMap() { static std::unordered_map instance; return instance; } -std::vector &getInstrumentationVarTableEntries() { +static std::vector &getInstrumentationVarTableEntries() { static std::vector instance; return instance; } // Read words from a string -void getWords(const std::string &str, std::vector &words) { +static void getWords(const std::string &str, std::vector &words) { std::stringstream ss(str); std::string word; while (ss >> word) { @@ -58,7 +58,7 @@ void getWords(const std::string &str, std::vector &words) { // offset -> instrumentation variable name // The table is sorted by offset -void readInstrumentedVarTable(const std::string &filePath) { +static void readInstrumentedVarTable(const std::string &filePath) { auto &tableEntries = getInstrumentationVarTableEntries(); std::ifstream tableFile(filePath); std::string line; @@ -84,7 +84,7 @@ void readInstrumentedVarTable(const std::string &filePath) { // We extend the kernel signature to take an additional argument, which is the memory holding // instrumentation variables. The map will be used to update the kernarg signature with a // bigger kernarg buffer size, to accomodate for the additional argument. -void readPreloadInfo(const std::string &filePath) { +static void readPreloadInfo(const std::string &filePath) { auto &kernargSizeMap = getKernargSizeMap(); auto &firstHiddenArgIndexMap = getFirstHiddenArgIndexMap(); diff --git a/update-exec.cpp b/update-exec.cpp index eaff109..9f5772c 100644 --- a/update-exec.cpp +++ b/update-exec.cpp @@ -14,8 +14,8 @@ // update-exec // These maps are for correcting the section links in the clone. -std::unordered_map ogToNewSectionMap; -std::unordered_map newToOgSectionMap; +static std::unordered_map ogToNewSectionMap; +static std::unordered_map newToOgSectionMap; static void showHelp(const char *toolName) { std::cout << "usage : \n"; @@ -47,7 +47,7 @@ static void dumpSection(const ELFIO::section *section, bool printContents = true // === SECTION-GETTING HELPERS BEGIN === // -ELFIO::section *getSection(const std::string §ionName, const ELFIO::elfio &file) { +static ELFIO::section *getSection(const std::string §ionName, const ELFIO::elfio &file) { for (int i = 0; i < file.sections.size(); ++i) { if (file.sections[i]->get_name() == sectionName) return file.sections[i]; @@ -55,11 +55,11 @@ ELFIO::section *getSection(const std::string §ionName, const ELFIO::elfio &f return nullptr; } -ELFIO::section *getFatbinSection(const ELFIO::elfio &file) { +static ELFIO::section *getFatbinSection(const ELFIO::elfio &file) { return getSection(".hip_fatbin", file); } -ELFIO::section *getFatbinWrapperSection(const ELFIO::elfio &file) { +static ELFIO::section *getFatbinWrapperSection(const ELFIO::elfio &file) { return getSection(".hipFatBinSegment", file); } // @@ -79,7 +79,7 @@ static size_t getFileSize(const std::string &filePath) { return size; } -ELFIO::segment *getPtLoad1(const ELFIO::elfio &file) { +static ELFIO::segment *getPtLoad1(const ELFIO::elfio &file) { for (int i = 0; i < file.segments.size(); ++i) { auto segment = file.segments[i]; if (segment->get_type() == ELFIO::PT_LOAD) @@ -88,7 +88,7 @@ ELFIO::segment *getPtLoad1(const ELFIO::elfio &file) { return nullptr; } -ELFIO::segment *getPhdrSegment(const ELFIO::elfio &file) { +static ELFIO::segment *getPhdrSegment(const ELFIO::elfio &file) { size_t entryPoint = file.get_entry(); for (int i = 0; i < file.segments.size(); ++i) { auto segment = file.segments[i]; @@ -98,7 +98,7 @@ ELFIO::segment *getPhdrSegment(const ELFIO::elfio &file) { return nullptr; } -ELFIO::segment *getLastSegment(const ELFIO::elfio &execFile) { +static ELFIO::segment *getLastSegment(const ELFIO::elfio &execFile) { const size_t numSegments = execFile.segments.size(); assert(numSegments != 0); @@ -121,7 +121,7 @@ ELFIO::segment *getLastSegment(const ELFIO::elfio &execFile) { return lastSegment; } -void cloneHeader(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { +static void cloneHeader(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { newExec.create(ogExec.get_class(), ogExec.get_encoding()); newExec.set_os_abi(ogExec.get_os_abi()); newExec.set_abi_version(ogExec.get_abi_version()); @@ -130,7 +130,7 @@ void cloneHeader(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { newExec.set_entry(ogExec.get_entry()); } -bool shouldClone(const ELFIO::section *section) { +static bool shouldClone(const ELFIO::section *section) { switch (section->get_type()) { case ELFIO::SHT_NULL: return false; @@ -146,7 +146,7 @@ bool shouldClone(const ELFIO::section *section) { } } -void cloneSections(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { +static void cloneSections(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { auto ogSections = ogExec.sections; for (size_t i = 0; i < ogSections.size(); ++i) { ELFIO::section *ogSection = ogSections[i]; @@ -181,7 +181,7 @@ void cloneSections(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { } } -void correctSectionLinks(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { +static void correctSectionLinks(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { auto ogSections = ogExec.sections; auto newSections = newExec.sections; @@ -209,7 +209,7 @@ void correctSectionLinks(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { } } -void cloneSegments(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { +static void cloneSegments(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { auto ogSegments = ogExec.segments; for (size_t i = 0; i < ogSegments.size(); ++i) { ELFIO::segment *ogSegment = ogSegments[i]; @@ -247,14 +247,14 @@ void cloneSegments(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { } } -void cloneExec(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { +static void cloneExec(const ELFIO::elfio &ogExec, ELFIO::elfio &newExec) { cloneHeader(ogExec, newExec); cloneSections(ogExec, newExec); correctSectionLinks(ogExec, newExec); cloneSegments(ogExec, newExec); } -void updateFatbinAddr(ELFIO::elfio &execFile, uint64_t newAddr) { +static void updateFatbinAddr(ELFIO::elfio &execFile, uint64_t newAddr) { ELFIO::section *fatbinWrapperSection = getFatbinWrapperSection(execFile); // address is at offset 8. @@ -264,7 +264,7 @@ void updateFatbinAddr(ELFIO::elfio &execFile, uint64_t newAddr) { // Create a .new_fatbin section, map it to a new PT_LOAD segment, update the // fatbin wrapper. -void addNewFatbin(ELFIO::elfio &newExec, const char *newFatbinContent, size_t newFatbinSize) { +static void addNewFatbin(ELFIO::elfio &newExec, const char *newFatbinContent, size_t newFatbinSize) { ELFIO::section *fatbinSection = getFatbinSection(newExec); assert(fatbinSection); @@ -306,7 +306,7 @@ void addNewFatbin(ELFIO::elfio &newExec, const char *newFatbinContent, size_t ne // hence keeping the include here. #include -void patchExec(const char *rwExecPath) { +static void patchExec(const char *rwExecPath) { ELFIO::elfio newExecFile; FILE *rawNewElf = fopen(rwExecPath, "rb+"); diff --git a/update-fatbin.cpp b/update-fatbin.cpp index 55098da..8d881a6 100644 --- a/update-fatbin.cpp +++ b/update-fatbin.cpp @@ -65,7 +65,7 @@ static void getgpuBinInfos(const std::string &fatbinPath, std::vector &infos) { +static void dumpInfos(std::vector &infos) { for (auto &info : infos) info.dump(std::cout); } diff --git a/update-note-phdr.cpp b/update-note-phdr.cpp index de49b0d..aea5e7e 100644 --- a/update-note-phdr.cpp +++ b/update-note-phdr.cpp @@ -5,6 +5,9 @@ // usage: // update-note-phdr +// This tool makes the note section loadable by setting its virtual address and mapping it to the +// note segment. + static void showHelp(const char *toolName) { std::cout << "usage : \n"; std::cout << " "; From 2c32ef22f196c71ed5ab238d2ba4c72efd567699 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Tue, 21 Apr 2026 01:11:22 -0500 Subject: [PATCH 05/28] Add initial README --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..2960b25 --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +# AMDGPU Tools for Dyninst + +The Dyninst mutator can currently only rewrite the GPU ELF binary, but doesn't rewrite the metadata in the GPU binary. + +These tools are used alongside the Dyninst mutator for all the additional tasks. This includes the following: +1. Extracting and embedding the fat binary in the host executable +2. Extracting and embedding the GPU ELF binary in the fat binary +3. Rewriting metadata in the instrumented GPU binary +4. Using a preload library to pass additional argument for kernel launch + +These tools are tested and developed on ROCm 6.0.0 and GFX908. + +## Building + ```cmake /path/to/amd_gpu_tools -DROCM_PATH=/path/to/rocm/install``` From 3d4736481afe3f26d8406813b58f64e273d212f3 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Tue, 21 Apr 2026 01:19:27 -0500 Subject: [PATCH 06/28] Update README.md --- README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2960b25..30694dd 100644 --- a/README.md +++ b/README.md @@ -11,4 +11,16 @@ These tools are used alongside the Dyninst mutator for all the additional tasks. These tools are tested and developed on ROCm 6.0.0 and GFX908. ## Building - ```cmake /path/to/amd_gpu_tools -DROCM_PATH=/path/to/rocm/install``` + ``` + cmake /path/to/amd_gpu_tools -DROCM_PATH=/path/to/rocm/install + ``` + +## Running + + ``` + instr-driver + ``` + +The host executable contains the host code and the fat binary which contains device code. + +Ensure that the build directory for these tools is appended to `PATH` From b784ff8b3880fa10846831d7255cdab4b8933293 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Tue, 21 Apr 2026 01:26:34 -0500 Subject: [PATCH 07/28] Use C++ 20 --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 890a774..2173c98 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,7 @@ cmake_minimum_required(VERSION 3.20) +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) set(ROCM_PATH "" From 605feced0a861422a1527929b4f255db7437e15f Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Tue, 21 Apr 2026 01:26:59 -0500 Subject: [PATCH 08/28] use std::string::starts_with in update-note --- update-note.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/update-note.cpp b/update-note.cpp index f6a5f3d..c013720 100644 --- a/update-note.cpp +++ b/update-note.cpp @@ -11,13 +11,6 @@ // 1. Adding an additional argument for Dyninst's instrumentation variables // 2. Maxing out SGPR allocation. -static bool startsWith(const std::string &prefix, const std::string &str) { - if (prefix.length() > str.length()) - return false; - - return str.substr(0, prefix.length()) == prefix; -} - struct KernelInfo { std::string name; unsigned newKernargBufferSize; @@ -61,14 +54,14 @@ static void createNewArgumentList(std::vector &ogArgumentList, ogArgumentList[i].convert(arg); msgpack::object valueKindObject = arg[".value_kind"]; valueKindObject.convert(valueKind); - if (startsWith("hidden", valueKind)) { + if (valueKind.starts_with("hidden")) { break; } newArgumentList.push_back(ogArgumentList[i]); } // Now we are at the first hidden arg. - assert(i < ogArgumentList.size() && startsWith("hidden", valueKind)); + assert(i < ogArgumentList.size() && valueKind.starts_with("hidden")); kernelInfo.firstHiddenArgIndex = i; std::map newArg; From 6cb5b8eedd00be006aebf17ecda61daa771fb946 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Tue, 21 Apr 2026 01:29:28 -0500 Subject: [PATCH 09/28] More use of static on functions --- update-note-phdr.cpp | 6 +++--- update-note.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/update-note-phdr.cpp b/update-note-phdr.cpp index aea5e7e..8514f66 100644 --- a/update-note-phdr.cpp +++ b/update-note-phdr.cpp @@ -35,7 +35,7 @@ static void dumpSection(const ELFIO::section *section, bool printContents = true std::cout << std::dec << '\n'; } -ELFIO::section *getSection(const std::string §ionName, const ELFIO::elfio &file) { +static ELFIO::section *getSection(const std::string §ionName, const ELFIO::elfio &file) { for (int i = 0; i < file.sections.size(); ++i) { if (file.sections[i]->get_name() == sectionName) return file.sections[i]; @@ -43,9 +43,9 @@ ELFIO::section *getSection(const std::string §ionName, const ELFIO::elfio &f return nullptr; } -ELFIO::section *getNoteSection(const ELFIO::elfio &file) { return getSection(".note", file); } +static ELFIO::section *getNoteSection(const ELFIO::elfio &file) { return getSection(".note", file); } -ELFIO::segment *getNoteSegment(const ELFIO::elfio &file) { +static ELFIO::segment *getNoteSegment(const ELFIO::elfio &file) { for (int i = 0; i < file.segments.size(); ++i) { auto segment = file.segments[i]; if (segment->get_type() == ELFIO::PT_NOTE) diff --git a/update-note.cpp b/update-note.cpp index c013720..a13fe6c 100644 --- a/update-note.cpp +++ b/update-note.cpp @@ -28,7 +28,7 @@ static constexpr uint32_t DYNINST_ARG_SIZE = 8; // Create a new argument, which is pointer to the Dyninst's memory buffer for // variables -void createNewArgument(std::map &newArgument, int offset, +static void createNewArgument(std::map &newArgument, int offset, msgpack::zone &z) { newArgument[".name"] = msgpack::object(std::string("dyninst_mem"), z); newArgument[".address_space"] = msgpack::object(std::string("global"), z); @@ -84,7 +84,7 @@ static std::string readNoteFile(const std::string &fileName) { return std::string(buffer.str()); } -void rewriteNotes(const std::string &fileName, const std::string& newFileName, std::vector &instrumentedKernelInfos) { +static void rewriteNotes(const std::string &fileName, const std::string& newFileName, std::vector &instrumentedKernelInfos) { // Step 1 - read .note file into buffer std::string noteBuffer = readNoteFile(fileName); @@ -201,7 +201,7 @@ void rewriteNotes(const std::string &fileName, const std::string& newFileName, s outFile.close(); } -void readInstrumentedKernelInfos(const std::string &filePath, +static void readInstrumentedKernelInfos(const std::string &filePath, std::vector &instrumentedKernelInfos) { std::ifstream file(filePath); std::string word; @@ -216,7 +216,7 @@ void readInstrumentedKernelInfos(const std::string &filePath, file.close(); } -void writeUpdatedKernelInfos(const std::string &filePath, +static void writeUpdatedKernelInfos(const std::string &filePath, std::vector &instrumentedKernelInfos) { std::ofstream file(filePath); assert(file.is_open()); From 0564a000cdbdc43f9d3e4ddf5860e7bdf45def74 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Tue, 21 Apr 2026 13:10:35 -0500 Subject: [PATCH 10/28] Use ROCM_PATH instead of /opt/rocm for preload --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2173c98..5c34df4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,7 +51,7 @@ set(PRELOAD_SO "${CMAKE_CURRENT_BINARY_DIR}/preload.so") add_custom_command( OUTPUT "${PRELOAD_SO}" COMMAND ${HIPCC} -D__HIP_PLATFORM_AMD__ -x c++ -shared -fpic - -I/opt/rocm-6.0.0/include/ "${PRELOAD_SOURCE}" -o "${PRELOAD_SO}" + -I"${ROCM_PATH}"/include/ "${PRELOAD_SOURCE}" -o "${PRELOAD_SO}" DEPENDS "${PRELOAD_SOURCE}" COMMENT "Building ${PRELOAD_SOURCE} with ${HIPCC}" VERBATIM) From b072a34e2cb040cecfa2f04cb6f7c4439bae847d Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Fri, 24 Apr 2026 12:41:50 -0500 Subject: [PATCH 11/28] Address comments to extract-fatbin --- extract-fatbin.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/extract-fatbin.cpp b/extract-fatbin.cpp index 654ef1e..eff5765 100644 --- a/extract-fatbin.cpp +++ b/extract-fatbin.cpp @@ -24,13 +24,13 @@ int main(int argc, char **argv) { ELFIO::elfio execFile; if (!execFile.load(argv[1])) { - std::cout << "can't find or process ELF file " << argv[1] << '\n'; + std::cerr << "can't find or process ELF file " << argv[1] << '\n'; exit(1); } ELFIO::section *fatbinSection = getFatbinSection(execFile); if (!fatbinSection) { - std::cout << ".hip_fatbin section not found in " << argv[1] << "\n"; + std::cerr << ".hip_fatbin section not found in " << argv[1] << "\n"; exit(1); } @@ -38,7 +38,6 @@ int main(int argc, char **argv) { std::ofstream fatbinFile(std::string(argv[1]) + ".fatbin", std::ios::out | std::ios::binary); fatbinFile.write(fatbinSection->get_data(), fatbinSection->get_size()); - fatbinFile.close(); return 0; } From 9f86d247bde990d47c9a4233596e2fe683eb508a Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Fri, 24 Apr 2026 13:00:54 -0500 Subject: [PATCH 12/28] Address comments to extract-gpubin --- extract-gpubin.cpp | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/extract-gpubin.cpp b/extract-gpubin.cpp index 889919c..ff32060 100644 --- a/extract-gpubin.cpp +++ b/extract-gpubin.cpp @@ -24,11 +24,16 @@ int main(int argc, char *argv[]) { exit(1); } - char buffer[24 + 1]; - fatbin.read(buffer, 24); - buffer[24] = 0; + // This is at the beginning of the clang-offload-bundle file. + // See https://clang.llvm.org/docs/ClangOffloadBundler.html + constexpr std::string_view magicString("__CLANG_OFFLOAD_BUNDLE__"); + constexpr uint32_t magicStringLength = magicString.length(); - assert(std::string(buffer) == "__CLANG_OFFLOAD_BUNDLE__"); + char buffer[magicStringLength + 1]; + fatbin.read(buffer, magicStringLength); + buffer[magicStringLength] = 0; + + assert(std::string(buffer) == magicString); uint64_t numBundleEntries = 0; fatbin.read(reinterpret_cast(&numBundleEntries), sizeof(numBundleEntries)); @@ -49,10 +54,10 @@ int main(int argc, char *argv[]) { uint64_t idLength; fatbin.read(reinterpret_cast(&idLength), sizeof(idLength)); - char id[idLength]; - fatbin.read(id, idLength); + std::string idString; + idString.resize(idLength); + fatbin.read(&idString[0], idLength); - std::string idString(id); // If idString ends with arch if (idString.substr(idLength - arch.length()) == arch) { elfStart = bundleEntryCodeObjectOffset; @@ -64,14 +69,15 @@ int main(int argc, char *argv[]) { if (!found) { std::cerr << fatbinPath << " doesn't contain a " << arch << " binary\n"; - exit(0); + exit(1); } // std::cout << arch << ' ' << "ELF at " << elfStart << " of size " << elfSize << '\n'; fatbin.seekg(elfStart, std::ios::beg); - char data[elfSize]; - fatbin.read(data, elfSize); + std::string data; + data.resize(elfSize); + fatbin.read(&data[0], elfSize); std::string elfBinPath(fatbinPath + "." + arch); std::ofstream elfBin(elfBinPath, std::ios::binary); @@ -81,8 +87,5 @@ int main(int argc, char *argv[]) { exit(1); } - elfBin.write(data, elfSize); - elfBin.close(); - - fatbin.close(); + elfBin.write(&data[0], elfSize); } From 43198b0c609d116783444a373edb72c0f03e7952 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Thu, 30 Apr 2026 13:09:51 -0500 Subject: [PATCH 13/28] Use SYSTEM keyword to eliminate third party code warnings --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c34df4..4bcf711 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,15 +22,15 @@ project(amdgpu-tooling LANGUAGES CXX) add_executable(update-note update-note.cpp) target_include_directories( - update-note PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/msgpack-c/include) + update-note SYSTEM PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/msgpack-c/include) add_executable(update-note-phdr update-note-phdr.cpp) target_include_directories( - update-note-phdr PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/ELFIO) + update-note-phdr SYSTEM PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/ELFIO) add_executable(extract-fatbin extract-fatbin.cpp) target_include_directories( - extract-fatbin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/ELFIO) + extract-fatbin SYSTEM PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/ELFIO) add_executable(extract-gpubin extract-gpubin.cpp) @@ -38,7 +38,7 @@ add_executable(update-fatbin update-fatbin.cpp) add_executable(update-exec update-exec.cpp) target_include_directories( - update-exec PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/ELFIO) + update-exec SYSTEM PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/third-party/ELFIO) # SPECIAL CASE FOR PRELOAD From 1ca986c24feb775494a7f4ec7137df5c3b573df4 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Thu, 30 Apr 2026 13:10:20 -0500 Subject: [PATCH 14/28] Rename project to amd-gpu-tools in CMake --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4bcf711..c6a1d63 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,7 @@ endif() set(CMAKE_C_COMPILER "${ROCM_PATH}/bin/amdclang") set(CMAKE_CXX_COMPILER "${ROCM_PATH}/bin/amdclang++") -project(amdgpu-tooling LANGUAGES CXX) +project(amd-gpu-tools LANGUAGES CXX) # ALL REGULAR TOOLS add_executable(update-note update-note.cpp) From 3512594a2ca43f4c3dd5de383c22b3b8319e5eb5 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Thu, 30 Apr 2026 13:44:15 -0500 Subject: [PATCH 15/28] Remove use of VLA in update-fatbin --- update-fatbin.cpp | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/update-fatbin.cpp b/update-fatbin.cpp index 8d881a6..089a64e 100644 --- a/update-fatbin.cpp +++ b/update-fatbin.cpp @@ -4,6 +4,10 @@ #include #include +// Magic string at the beginning of the bundle +static std::string magicStr("__CLANG_OFFLOAD_BUNDLE__"); + + struct GpuBinInfo { GpuBinInfo(const std::string &id_, uint64_t offset_, uint64_t size_) : id(id_), offset(offset_), size(size_) {} @@ -32,11 +36,11 @@ static void getgpuBinInfos(const std::string &fatbinPath, std::vector(&numBundleEntries), sizeof(numBundleEntries)); @@ -52,9 +56,9 @@ static void getgpuBinInfos(const std::string &fatbinPath, std::vector(&idLength), sizeof(idLength)); - char id[idLength + 1]; - fatbin.read(id, idLength); - id[idLength] = 0; // Make id null-terminated + std::string id; + id.resize(idLength); + fatbin.read(&id[0], idLength); GpuBinInfo info(id, bundleEntryCodeObjectOffset, size); infos.push_back(info); @@ -123,9 +127,10 @@ int main(int argc, char *argv[]) { std::cout << "elfBinSize = " << elfBinSize << '\n'; - char elfBinContents[elfBinSize]; + std::string elfBinContents; + elfBinContents.resize(elfBinSize); elfBin.seekg(0, std::ios::beg); - elfBin.read(elfBinContents, elfBinSize); + elfBin.read(&elfBinContents[0], elfBinSize); elfBin.close(); std::vector newBinInfos(gpuBinInfos); @@ -153,8 +158,6 @@ int main(int argc, char *argv[]) { } // "write" doesn't write null-terminated strings - // Magic string - std::string magicStr = "__CLANG_OFFLOAD_BUNDLE__"; newFatbin.write(magicStr.c_str(), magicStr.size()); assert(gpuBinInfos.size() == newBinInfos.size()); @@ -191,9 +194,10 @@ int main(int argc, char *argv[]) { // Writing upto archIndex for (int i = 0; i < archIndex; ++i) { - char buffer[gpuBinInfos[i].size]; + std::string buffer; + buffer.resize(gpuBinInfos[i].size); fatbin.seekg(gpuBinInfos[i].offset, std::ios::beg); - fatbin.read(buffer, gpuBinInfos[i].size); + fatbin.read(&buffer[0], gpuBinInfos[i].size); // Write padding before we start writing the ELF files pos = newFatbin.tellp(); @@ -209,7 +213,7 @@ int main(int argc, char *argv[]) { // std::cout << offset << ' ' << gpuBinInfos[i].offset << '\n'; assert(offset == newBinInfos[i].offset && "Offset while writing ELF in new fatbin must match what we computed"); - newFatbin.write(buffer, gpuBinInfos[i].size); + newFatbin.write(buffer.c_str(), gpuBinInfos[i].size); } // The instrumented gpubin @@ -226,7 +230,7 @@ int main(int argc, char *argv[]) { offset = static_cast(pos - std::streampos(0)); std::cout << "writing instrumented bin at offset " << offset << '\n'; - newFatbin.write(elfBinContents, elfBinSize); + newFatbin.write(elfBinContents.c_str(), elfBinSize); // After archIndex for (int i = archIndex + 1; i < gpuBinInfos.size(); ++i) { From 466bc55b98dd1751e98d4eb4b7035874f1ee0d49 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Thu, 30 Apr 2026 13:57:40 -0500 Subject: [PATCH 16/28] Modernize for loop in getSection --- extract-fatbin.cpp | 7 ++++--- update-exec.cpp | 7 ++++--- update-note-phdr.cpp | 7 ++++--- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/extract-fatbin.cpp b/extract-fatbin.cpp index eff5765..d8d8f27 100644 --- a/extract-fatbin.cpp +++ b/extract-fatbin.cpp @@ -5,9 +5,10 @@ #include "elfio/elfio.hpp" static ELFIO::section *getSection(const std::string §ionName, const ELFIO::elfio &file) { - for (int i = 0; i < file.sections.size(); ++i) { - if (file.sections[i]->get_name() == sectionName) - return file.sections[i]; + for (const auto §ion: file.sections) { + if (section->get_name() == sectionName) { + return section.get(); + } } return nullptr; } diff --git a/update-exec.cpp b/update-exec.cpp index 9f5772c..4a467f6 100644 --- a/update-exec.cpp +++ b/update-exec.cpp @@ -48,9 +48,10 @@ static void dumpSection(const ELFIO::section *section, bool printContents = true // === SECTION-GETTING HELPERS BEGIN === // static ELFIO::section *getSection(const std::string §ionName, const ELFIO::elfio &file) { - for (int i = 0; i < file.sections.size(); ++i) { - if (file.sections[i]->get_name() == sectionName) - return file.sections[i]; + for (const auto §ion: file.sections) { + if (section->get_name() == sectionName) { + return section.get(); + } } return nullptr; } diff --git a/update-note-phdr.cpp b/update-note-phdr.cpp index 8514f66..2faaf28 100644 --- a/update-note-phdr.cpp +++ b/update-note-phdr.cpp @@ -36,9 +36,10 @@ static void dumpSection(const ELFIO::section *section, bool printContents = true } static ELFIO::section *getSection(const std::string §ionName, const ELFIO::elfio &file) { - for (int i = 0; i < file.sections.size(); ++i) { - if (file.sections[i]->get_name() == sectionName) - return file.sections[i]; + for (const auto §ion: file.sections) { + if (section->get_name() == sectionName) { + return section.get(); + } } return nullptr; } From c95366e0d644dc89cfc962c872efb052688c9ffd Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Thu, 30 Apr 2026 14:00:35 -0500 Subject: [PATCH 17/28] Enable more compiler warnings --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c6a1d63..ab2701c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,9 @@ set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_FLAGS "-Wall -Wextra -pedantic") +set(CMAKE_C_FLAGS "-Wall -Wextra -pedantic") + set(ROCM_PATH "" CACHE PATH "Path to ROCm install directory") From 3a428a0b71eb774bf61468e0eaf3cfae701fbc30 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Thu, 30 Apr 2026 14:04:36 -0500 Subject: [PATCH 18/28] Fix new warnings in update-note-phdr --- update-note-phdr.cpp | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/update-note-phdr.cpp b/update-note-phdr.cpp index 2faaf28..73a1190 100644 --- a/update-note-phdr.cpp +++ b/update-note-phdr.cpp @@ -14,26 +14,29 @@ static void showHelp(const char *toolName) { std::cout << toolName << " \n\n"; } -static void dumpSection(const ELFIO::section *section, bool printContents = true) { - assert(section && "section must be non-null"); - - std::cout << "section : " << section->get_name() << ", "; - std::cout << "size : " << section->get_size() << ", "; - std::cout << "offset : " << section->get_offset() << ", "; - std::cout << "addr-align : " << section->get_addr_align() << ", "; - std::cout << "entry-size : " << section->get_entry_size() << '\n'; - - if (!printContents) - return; - - std::cout << "section contents :\n"; - - std::cout << std::hex; - for (int i = 0; i < section->get_size(); ++i) { - std::cout << (unsigned)section->get_data()[i] << ' '; - } - std::cout << std::dec << '\n'; -} +// Commenting out to prevent unused function warning as this can be used later if we +// add log levels for debugging +// +// static void dumpSection(const ELFIO::section *section, bool printContents = true) { +// assert(section && "section must be non-null"); +// +// std::cout << "section : " << section->get_name() << ", "; +// std::cout << "size : " << section->get_size() << ", "; +// std::cout << "offset : " << section->get_offset() << ", "; +// std::cout << "addr-align : " << section->get_addr_align() << ", "; +// std::cout << "entry-size : " << section->get_entry_size() << '\n'; +// +// if (!printContents) +// return; +// +// std::cout << "section contents :\n"; +// +// std::cout << std::hex; +// for (size_t i = 0; i < section->get_size(); ++i) { +// std::cout << (unsigned)section->get_data()[i] << ' '; +// } +// std::cout << std::dec << '\n'; +// } static ELFIO::section *getSection(const std::string §ionName, const ELFIO::elfio &file) { for (const auto §ion: file.sections) { From 33197e31a5cbaa6fbf7d8182619479c67e02d136 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Thu, 30 Apr 2026 14:06:52 -0500 Subject: [PATCH 19/28] Fix warnings in update-notes --- update-note.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/update-note.cpp b/update-note.cpp index a13fe6c..858f2b6 100644 --- a/update-note.cpp +++ b/update-note.cpp @@ -49,7 +49,7 @@ static void createNewArgumentList(std::vector &ogArgumentList, unsigned newKernargBufferSize, msgpack::zone &z, KernelInfo &kernelInfo) { std::map arg; std::string valueKind; - int i = 0; + size_t i = 0; for (; i < ogArgumentList.size(); ++i) { ogArgumentList[i].convert(arg); msgpack::object valueKindObject = arg[".value_kind"]; @@ -221,7 +221,7 @@ static void writeUpdatedKernelInfos(const std::string &filePath, std::ofstream file(filePath); assert(file.is_open()); - for (auto const kernelInfo : instrumentedKernelInfos) { + for (auto const &kernelInfo : instrumentedKernelInfos) { file << kernelInfo.name << ' ' << kernelInfo.newKernargBufferSize << ' ' << kernelInfo.firstHiddenArgIndex << '\n'; } From 6ccb09718284e9140b7f33f223875f0d18119e83 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Thu, 30 Apr 2026 16:08:24 -0500 Subject: [PATCH 20/28] Fix warnings in update-fatbin --- update-fatbin.cpp | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/update-fatbin.cpp b/update-fatbin.cpp index 089a64e..28da3a0 100644 --- a/update-fatbin.cpp +++ b/update-fatbin.cpp @@ -69,14 +69,18 @@ static void getgpuBinInfos(const std::string &fatbinPath, std::vector &infos) { - for (auto &info : infos) - info.dump(std::cout); -} +// Commenting out to prevent unused function warning as this can be used later if we +// add log levels for debugging +// +// static void dumpInfos(std::vector &infos) { +// for (auto &info : infos) +// info.dump(std::cout); +// } static int getIndex(const std::string &arch, const std::vector &infos) { int index = -1; - for (int i = 0; i < infos.size(); ++i) { + int infosLength = static_cast(infos.size()); + for (int i = 0; i < infosLength; ++i) { const GpuBinInfo &info = infos[i]; size_t idLength = info.id.length(); if (info.id.substr(idLength - arch.length()) == arch) { @@ -140,7 +144,7 @@ int main(int argc, char *argv[]) { // respect the alignment when updating the offsets. newBinInfos[archIndex].size = elfBinSize; - for (int i = archIndex + 1; i < newBinInfos.size(); ++i) { + for (size_t i = archIndex + 1; i < newBinInfos.size(); ++i) { GpuBinInfo prevInfo = newBinInfos[i - 1]; if (prevInfo.offset + prevInfo.size > newBinInfos[i].offset) { newBinInfos[i].offset = alignUp(prevInfo.offset + prevInfo.size, 0x1000); @@ -201,17 +205,17 @@ int main(int argc, char *argv[]) { // Write padding before we start writing the ELF files pos = newFatbin.tellp(); - offset = static_cast(pos - std::streampos(0)); + offset = static_cast(pos - std::streampos(0)); uint64_t paddingCount = alignUp(offset, 0x1000) - offset; std::vector padding(paddingCount, 0); newFatbin.write(padding.data(), paddingCount); pos = newFatbin.tellp(); - offset = static_cast(pos - std::streampos(0)); + offset = static_cast(pos - std::streampos(0)); // std::cout << offset << ' ' << gpuBinInfos[i].offset << '\n'; - assert(offset == newBinInfos[i].offset && + assert(static_cast(offset) == newBinInfos[i].offset && "Offset while writing ELF in new fatbin must match what we computed"); newFatbin.write(buffer.c_str(), gpuBinInfos[i].size); } @@ -233,10 +237,11 @@ int main(int argc, char *argv[]) { newFatbin.write(elfBinContents.c_str(), elfBinSize); // After archIndex - for (int i = archIndex + 1; i < gpuBinInfos.size(); ++i) { - char buffer[gpuBinInfos[i].size]; + for (size_t i = archIndex + 1; i < gpuBinInfos.size(); ++i) { + std::string buffer; + buffer.resize(gpuBinInfos[i].size); fatbin.seekg(gpuBinInfos[i].offset, std::ios::beg); - fatbin.read(buffer, gpuBinInfos[i].size); + fatbin.read(&buffer[0], gpuBinInfos[i].size); pos = newFatbin.tellp(); offset = static_cast(pos - std::streampos(0)); @@ -249,9 +254,9 @@ int main(int argc, char *argv[]) { offset = static_cast(pos - std::streampos(0)); // std::cout << offset << ' ' << gpuBinInfos[i].offset << ' ' << newBinInfos[i].offset << '\n'; - assert(offset == newBinInfos[i].offset && + assert(static_cast(offset) == newBinInfos[i].offset && "Offset while writing ELF in new fatbin must match what we computed"); - newFatbin.write(buffer, gpuBinInfos[i].size); + newFatbin.write(buffer.c_str(), gpuBinInfos[i].size); } fatbin.close(); From 9e44c0548a29d2c58a34b23e49578ace6bb34352 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Thu, 30 Apr 2026 16:10:56 -0500 Subject: [PATCH 21/28] Fix warnigns in update-exec --- update-exec.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/update-exec.cpp b/update-exec.cpp index 4a467f6..42bee3d 100644 --- a/update-exec.cpp +++ b/update-exec.cpp @@ -39,7 +39,7 @@ static void dumpSection(const ELFIO::section *section, bool printContents = true std::cout << "section contents :\n"; std::cout << std::hex; - for (int i = 0; i < section->get_size(); ++i) { + for (size_t i = 0; i < section->get_size(); ++i) { std::cout << (unsigned)section->get_data()[i] << ' '; } std::cout << std::dec << '\n'; @@ -90,7 +90,6 @@ static ELFIO::segment *getPtLoad1(const ELFIO::elfio &file) { } static ELFIO::segment *getPhdrSegment(const ELFIO::elfio &file) { - size_t entryPoint = file.get_entry(); for (int i = 0; i < file.segments.size(); ++i) { auto segment = file.segments[i]; if (segment->get_type() == ELFIO::PT_PHDR) From f516d736457c20115bdd8daa25ca1fd05dd4ce80 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Thu, 30 Apr 2026 16:16:37 -0500 Subject: [PATCH 22/28] Use constexpr for alignment --- update-fatbin.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/update-fatbin.cpp b/update-fatbin.cpp index 28da3a0..f1db474 100644 --- a/update-fatbin.cpp +++ b/update-fatbin.cpp @@ -142,12 +142,14 @@ int main(int argc, char *argv[]) { // If the binary we want to "replace" is followed by other binaries, we must // update their offsets. Since all offsets are 0x1000 (i.e 4096) aligned we also // respect the alignment when updating the offsets. + + constexpr uint64_t alignment = 0x1000; newBinInfos[archIndex].size = elfBinSize; for (size_t i = archIndex + 1; i < newBinInfos.size(); ++i) { GpuBinInfo prevInfo = newBinInfos[i - 1]; if (prevInfo.offset + prevInfo.size > newBinInfos[i].offset) { - newBinInfos[i].offset = alignUp(prevInfo.offset + prevInfo.size, 0x1000); + newBinInfos[i].offset = alignUp(prevInfo.offset + prevInfo.size, alignment); } } @@ -207,7 +209,7 @@ int main(int argc, char *argv[]) { pos = newFatbin.tellp(); offset = static_cast(pos - std::streampos(0)); - uint64_t paddingCount = alignUp(offset, 0x1000) - offset; + uint64_t paddingCount = alignUp(offset, alignment) - offset; std::vector padding(paddingCount, 0); newFatbin.write(padding.data(), paddingCount); @@ -224,7 +226,7 @@ int main(int argc, char *argv[]) { pos = newFatbin.tellp(); offset = static_cast(pos - std::streampos(0)); - uint64_t newBinPaddingCount = alignUp(offset, 0x1000) - offset; + uint64_t newBinPaddingCount = alignUp(offset, alignment) - offset; std::cout << "padding for instrumented bin = " << newBinPaddingCount << " bytes\n"; std::vector newBinPadding(newBinPaddingCount, ' '); @@ -246,7 +248,7 @@ int main(int argc, char *argv[]) { pos = newFatbin.tellp(); offset = static_cast(pos - std::streampos(0)); - uint64_t paddingCount = alignUp(offset, 0x1000) - offset; + uint64_t paddingCount = alignUp(offset, alignment) - offset; std::vector padding(paddingCount, 0); newFatbin.write(padding.data(), paddingCount); From e92b7fd5bace393d7194e78a3dc51952536e1298 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Fri, 1 May 2026 14:45:03 -0500 Subject: [PATCH 23/28] Update extract-fatbin to take output argument --- extract-fatbin.cpp | 17 +++++++++++------ instr-driver | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/extract-fatbin.cpp b/extract-fatbin.cpp index d8d8f27..e946e0f 100644 --- a/extract-fatbin.cpp +++ b/extract-fatbin.cpp @@ -4,6 +4,9 @@ #include "elfio/elfio.hpp" +// usage: +// extract-fatbin + static ELFIO::section *getSection(const std::string §ionName, const ELFIO::elfio &file) { for (const auto §ion: file.sections) { if (section->get_name() == sectionName) { @@ -18,25 +21,27 @@ static ELFIO::section *getFatbinSection(const ELFIO::elfio &file) { } int main(int argc, char **argv) { - if (argc != 2) { - std::cerr << "Usage: " << argv[0] << " " << std::endl; + if (argc != 3) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; return 1; } + std::string execFilePath(argv[1]); + std::string outputFatbinPath(argv[2]); ELFIO::elfio execFile; - if (!execFile.load(argv[1])) { - std::cerr << "can't find or process ELF file " << argv[1] << '\n'; + if (!execFile.load(execFilePath)) { + std::cerr << "can't find or process ELF file " << execFilePath << '\n'; exit(1); } ELFIO::section *fatbinSection = getFatbinSection(execFile); if (!fatbinSection) { - std::cerr << ".hip_fatbin section not found in " << argv[1] << "\n"; + std::cerr << ".hip_fatbin section not found in " << execFilePath << "\n"; exit(1); } // Write fatbin to a separate file - std::ofstream fatbinFile(std::string(argv[1]) + ".fatbin", std::ios::out | std::ios::binary); + std::ofstream fatbinFile(outputFatbinPath, std::ios::out | std::ios::binary); fatbinFile.write(fatbinSection->get_data(), fatbinSection->get_size()); diff --git a/instr-driver b/instr-driver index 78ee59f..2a52a68 100755 --- a/instr-driver +++ b/instr-driver @@ -26,7 +26,7 @@ NOTE_IN=$GPUBIN.note NOTE_OUT=$NOTE_IN.expanded # 1. Extract fatbin. This will output a $FATBIN -extract-fatbin $EXEC_IN +extract-fatbin $EXEC_IN $FATBIN # 2. Extract gfx908 bin. This will output $GPUBIN extract-gpubin gfx908 $FATBIN From 7cc253f210187f7d47073e92a85e75559ff0dba2 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Fri, 1 May 2026 14:50:01 -0500 Subject: [PATCH 24/28] Update extract-gpubin to take output argument --- extract-gpubin.cpp | 10 +++++----- instr-driver | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/extract-gpubin.cpp b/extract-gpubin.cpp index ff32060..9e0967f 100644 --- a/extract-gpubin.cpp +++ b/extract-gpubin.cpp @@ -5,18 +5,19 @@ static void showHelp(const std::string &toolName) { std::cerr << "Usage : " << toolName << " " - << "" << std::endl; + << "" << " " << std::endl; std::cerr << "supported architectures : gfx900, gfx906, gfx908, gfx90a, gfx940" << std::endl; } int main(int argc, char *argv[]) { - if (argc != 3) { + if (argc != 4) { showHelp(argv[0]); exit(1); } std::string arch(argv[1]); std::string fatbinPath(argv[2]); + std::string gpubinPath(argv[3]); std::ifstream fatbin(fatbinPath, std::ios::binary); if (!fatbin) { @@ -79,11 +80,10 @@ int main(int argc, char *argv[]) { data.resize(elfSize); fatbin.read(&data[0], elfSize); - std::string elfBinPath(fatbinPath + "." + arch); - std::ofstream elfBin(elfBinPath, std::ios::binary); + std::ofstream elfBin(gpubinPath, std::ios::binary); if (!elfBin) { - std::cerr << "error : can't create " << elfBinPath << std::endl; + std::cerr << "error : can't create " << gpubinPath << std::endl; exit(1); } diff --git a/instr-driver b/instr-driver index 2a52a68..dd8db8b 100755 --- a/instr-driver +++ b/instr-driver @@ -29,7 +29,7 @@ NOTE_OUT=$NOTE_IN.expanded extract-fatbin $EXEC_IN $FATBIN # 2. Extract gfx908 bin. This will output $GPUBIN -extract-gpubin gfx908 $FATBIN +extract-gpubin gfx908 $FATBIN $GPUBIN # 3. Run the mutator, instrument kernels (also use the information from step 3). # This will also emit a file containing list of instrumented kernels ($NAMES_FILE) From 0593154611b509085884834f415df6fafa2ebbd8 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Fri, 1 May 2026 14:54:10 -0500 Subject: [PATCH 25/28] Update update-fatbin to take output argument --- instr-driver | 2 +- update-fatbin.cpp | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/instr-driver b/instr-driver index dd8db8b..1da6005 100755 --- a/instr-driver +++ b/instr-driver @@ -63,7 +63,7 @@ update-note-phdr $GPUBIN_UPDATED_NOTE $GPUBIN_FINAL # 6. Update original fatbin with instrumented gpu binary ($GPUBIN_FINAL) # This will emit $FATBIN_UPDATED -update-fatbin gfx908 $GPUBIN_FINAL $FATBIN +update-fatbin gfx908 $GPUBIN_FINAL $FATBIN $FATBIN_UPDATED # 7. Update the original executable ($EXEC_IN) by embedding $FATBIN_UPDATED # This will emit $EXEC_UPDATED diff --git a/update-fatbin.cpp b/update-fatbin.cpp index f1db474..060e062 100644 --- a/update-fatbin.cpp +++ b/update-fatbin.cpp @@ -24,9 +24,9 @@ struct GpuBinInfo { static void showHelp(const std::string &toolName) { std::cerr << "Usage : " << toolName << " " << " " - << "" << std::endl; + << "" << " " << std::endl; std::cerr << "supported architectures : gfx900, gfx906, gfx908, gfx90a, gfx940" << std::endl; - std::cerr << "This tool create a fat binary containing an instrumented GPU binary" << std::endl; + std::cerr << "This tool creates a fat binary containing an instrumented GPU binary" << std::endl; } static void getgpuBinInfos(const std::string &fatbinPath, std::vector &infos) { @@ -99,7 +99,7 @@ static uint64_t alignUp(uint64_t value, uint64_t alignment) { } int main(int argc, char *argv[]) { - if (argc != 4) { + if (argc != 5) { showHelp(argv[0]); exit(1); } @@ -107,6 +107,7 @@ int main(int argc, char *argv[]) { std::string arch(argv[1]); std::string elfBinPath(argv[2]); std::string fatbinPath(argv[3]); + std::string outputFatbinPath(argv[4]); std::vector gpuBinInfos; getgpuBinInfos(fatbinPath, gpuBinInfos); @@ -157,7 +158,7 @@ int main(int argc, char *argv[]) { // dumpInfos(newBinInfos); // Now we create a new fatbin - std::ofstream newFatbin(fatbinPath + ".updated", std::ios::binary); + std::ofstream newFatbin(outputFatbinPath, std::ios::binary); if (!newFatbin) { std::cerr << "error : can't open new fatbin" << std::endl; exit(1); From 29771f63555383de489c3e757f194090bfc5a421 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Fri, 1 May 2026 15:14:24 -0500 Subject: [PATCH 26/28] Update update-note to take output argument --- instr-driver | 8 +++++--- update-note.cpp | 19 ++++++++++--------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/instr-driver b/instr-driver index 1da6005..0efb3fc 100755 --- a/instr-driver +++ b/instr-driver @@ -23,7 +23,9 @@ GPUBIN_FINAL=$GPUBIN.final NAMES_FILE=$GPUBIN.instrumentedKernelNames NOTE_IN=$GPUBIN.note -NOTE_OUT=$NOTE_IN.expanded +NOTE_OUT=$NOTE_IN.updated + +PRELOAD_NAMES=$NAMES_FILE.preload # 1. Extract fatbin. This will output a $FATBIN extract-fatbin $EXEC_IN $FATBIN @@ -48,8 +50,8 @@ llvm-objcopy --dump-section=.note=$NOTE_IN $GPUBIN # 5.2 For each instrumented kernel, modify the metadata as follows: # - Expand the kernarg buffer with 1 additional argument, which the additional memory that we will allocate via the host. # - Increase SGPR usage to 112 (GFX908 only for now) -# This will emit $NOTE_OUT. -update-note $NAMES_FILE $NOTE_IN +# Outputs $NOTE_OUT and $PRELOAD_OUT +update-note $NAMES_FILE $NOTE_IN $NOTE_OUT $PRELOAD_NAMES # 5.3 Copy the updated binary, remove the note section cp $GPUBIN_INSTR $GPUBIN_UPDATED_NOTE diff --git a/update-note.cpp b/update-note.cpp index 858f2b6..609d150 100644 --- a/update-note.cpp +++ b/update-note.cpp @@ -229,23 +229,24 @@ static void writeUpdatedKernelInfos(const std::string &filePath, } int main(int argc, char *argv[]) { - if (argc != 3) { - printf("usage expand_args <.names file> <.note file>\n"); + if (argc != 5) { + std::cerr << "usage : "<< argv[0] << " " + << " \n"; return -1; } - std::string namesFile(argv[1]); - std::string noteFile(argv[2]); - std::string updatedNoteFile(noteFile + ".expanded"); + std::string namesPath(argv[1]); + std::string notePath(argv[2]); + std::string updatedNotePath(argv[3]); + std::string outputPreloadInfoPath(argv[4]); std::vector instrumentedKernelInfos; - readInstrumentedKernelInfos(namesFile, instrumentedKernelInfos); + readInstrumentedKernelInfos(namesPath, instrumentedKernelInfos); - rewriteNotes(noteFile, updatedNoteFile, instrumentedKernelInfos); + rewriteNotes(notePath, updatedNotePath, instrumentedKernelInfos); // The preload library will read this - std::string preloadNamesFile = namesFile + ".preload"; - writeUpdatedKernelInfos(preloadNamesFile, instrumentedKernelInfos); + writeUpdatedKernelInfos(outputPreloadInfoPath, instrumentedKernelInfos); return 0; } From 0f89d8ec4a16b0145501988505ddb97d1b261689 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Fri, 1 May 2026 15:29:31 -0500 Subject: [PATCH 27/28] Cleanup instr-driver --- instr-driver | 57 ++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/instr-driver b/instr-driver index 0efb3fc..8a1d22d 100755 --- a/instr-driver +++ b/instr-driver @@ -28,55 +28,54 @@ NOTE_OUT=$NOTE_IN.updated PRELOAD_NAMES=$NAMES_FILE.preload # 1. Extract fatbin. This will output a $FATBIN -extract-fatbin $EXEC_IN $FATBIN +extract-fatbin "$EXEC_IN" "$FATBIN" # 2. Extract gfx908 bin. This will output $GPUBIN -extract-gpubin gfx908 $FATBIN $GPUBIN +extract-gpubin gfx908 "$FATBIN" "$GPUBIN" # 3. Run the mutator, instrument kernels (also use the information from step 3). # This will also emit a file containing list of instrumented kernels ($NAMES_FILE) # $MUTATOR -procedure-count $GPUBIN -$MUTATOR $GPUBIN +"$MUTATOR" "$GPUBIN" -# 4. Update kernel descriptors for instrumented kernels -# This will produce $GPUBIN_UPDATED -# update-kd $NAMES_FILE $GPUBIN_INSTR - -# 5. Modify the note metadata +# 4. Modify the note metadata # -# 5.1 Extract the note section from original binary -llvm-objcopy --dump-section=.note=$NOTE_IN $GPUBIN +# 4.1 Extract the note section from original binary +llvm-objcopy --dump-section=.note="$NOTE_IN" "$GPUBIN" -# 5.2 For each instrumented kernel, modify the metadata as follows: -# - Expand the kernarg buffer with 1 additional argument, which the additional memory that we will allocate via the host. +# 4.2 For each instrumented kernel, modify the metadata as follows: +# - Update the kernarg signature with 1 additional argument, which is the additional memory +# that we will allocate via the host in the preload library. # - Increase SGPR usage to 112 (GFX908 only for now) -# Outputs $NOTE_OUT and $PRELOAD_OUT -update-note $NAMES_FILE $NOTE_IN $NOTE_OUT $PRELOAD_NAMES +# Outputs $NOTE_OUT and $PRELOAD_NAMES +update-note "$NAMES_FILE" "$NOTE_IN" "$NOTE_OUT" "$PRELOAD_NAMES" -# 5.3 Copy the updated binary, remove the note section -cp $GPUBIN_INSTR $GPUBIN_UPDATED_NOTE -llvm-objcopy --remove-section=.note $GPUBIN_UPDATED_NOTE +# 4.3 Copy the updated binary, remove the note section +cp "$GPUBIN_INSTR" "$GPUBIN_UPDATED_NOTE" +llvm-objcopy --remove-section=.note "$GPUBIN_UPDATED_NOTE" -# 5.4 Add the expanded note section -llvm-objcopy --add-section=.note=$NOTE_OUT $GPUBIN_UPDATED_NOTE +# 4.4 Add the expanded note section +llvm-objcopy --add-section=.note="$NOTE_OUT" "$GPUBIN_UPDATED_NOTE" -# 5.5 Update the program header for the notes section -update-note-phdr $GPUBIN_UPDATED_NOTE $GPUBIN_FINAL +# 4.5 Update the program header for the notes section +update-note-phdr "$GPUBIN_UPDATED_NOTE" "$GPUBIN_FINAL" -# 6. Update original fatbin with instrumented gpu binary ($GPUBIN_FINAL) +# 5. Update original fatbin with instrumented gpu binary ($GPUBIN_FINAL) # This will emit $FATBIN_UPDATED -update-fatbin gfx908 $GPUBIN_FINAL $FATBIN $FATBIN_UPDATED +update-fatbin gfx908 "$GPUBIN_FINAL" "$FATBIN" "$FATBIN_UPDATED" -# 7. Update the original executable ($EXEC_IN) by embedding $FATBIN_UPDATED +# 6. Update the original executable ($EXEC_IN) by embedding $FATBIN_UPDATED # This will emit $EXEC_UPDATED -update-exec $EXEC_IN $FATBIN_UPDATED $EXEC_UPDATED +update-exec "$EXEC_IN" "$FATBIN_UPDATED" "$EXEC_UPDATED" -# 8. Rename fatbin sections so that roc-obj* tools work with the modified executable. Those tools specifically look for the .hip_fatbin section by name. +# 7. Rename fatbin sections so that roc-obj* tools work with the modified executable. +# Those tools specifically look for the .hip_fatbin section by name. # - Rename .hip_fatbin section to .old_fatbin # - Rename .new_fatbin section to .hip_fatbin -# It is possible to do this within the update-exec tool, but doing it here is simpler and less error-prone -llvm-objcopy --rename-section .hip_fatbin=.old_fatbin $EXEC_UPDATED $EXEC_RENAMED1 -llvm-objcopy --rename-section .new_fatbin=.hip_fatbin $EXEC_RENAMED1 $EXEC_RENAMED2 +# It is possible to do this within the update-exec tool, but doing it here is simpler +# and less error-prone +llvm-objcopy --rename-section .hip_fatbin=.old_fatbin "$EXEC_UPDATED" "$EXEC_RENAMED1" +llvm-objcopy --rename-section .new_fatbin=.hip_fatbin "$EXEC_RENAMED1" "$EXEC_RENAMED2" cp $EXEC_RENAMED2 $EXEC_OUT From a29942b75b9faaa000bcaac96cebabeabd215e05 Mon Sep 17 00:00:00 2001 From: Ronak Chauhan Date: Fri, 1 May 2026 15:31:28 -0500 Subject: [PATCH 28/28] Fix comment in update-note --- update-note.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/update-note.cpp b/update-note.cpp index 609d150..89f39c0 100644 --- a/update-note.cpp +++ b/update-note.cpp @@ -102,7 +102,7 @@ static void rewriteNotes(const std::string &fileName, const std::string& newFile uint32_t offset = 0; // Step 2 - parse the ELF note header. This is not msgpack header. - // First 4 bytes : Size of the Name str (should be AMDGPU\0) + // First 4 bytes : Size of the Name str (should be strlen(AMDGPU\0)) // Second 4 bytes : Size of the note in msgpack format // Third 4 bytes : Type of the note (should be 32) // Followed by the Name str