diff --git a/include/nvtop/extract_gpuinfo_common.h b/include/nvtop/extract_gpuinfo_common.h index 9e4d1c9..5c0067b 100644 --- a/include/nvtop/extract_gpuinfo_common.h +++ b/include/nvtop/extract_gpuinfo_common.h @@ -240,4 +240,35 @@ inline unsigned busy_usage_from_time_usage_round(uint64_t current_use_ns, uint64 unsigned nvtop_pcie_gen_from_link_speed(unsigned linkSpeed); +// NVLink support +#define NVTOP_NVLINK_MAX_LINKS 36 + +struct nvlink_info { + unsigned num_links; // Number of NVLink links on this device + unsigned version; // NVLink version (e.g. 3 for NVLink 3.0) + bool supported; // NVLink is supported on this device + bool has_throughput; // Whether throughput data was available this cycle + unsigned long long aggregate_tx; // Aggregate TX throughput across all links (KiB/s) + unsigned long long aggregate_rx; // Aggregate RX throughput across all links (KiB/s) + unsigned long long total_errors; // Cumulative-since-launch errors across all links + unsigned long long total_corrections; // Cumulative-since-launch CRC corrections across all links + unsigned long long total_ecc_errors; // Cumulative-since-launch ECC data errors across all links +}; + +unsigned nvtop_get_nvlink_info(struct gpu_info *gpu_info, struct nvlink_info *nvlink_info); + +// Get display-ready NVLink error/correction/ECC counts from the per-device persistent struct. +// Returns true if baseline has been established at least once. +bool nvtop_get_nvlink_error_counts(struct gpu_info *gpu_info, + unsigned long long *out_errors, + unsigned long long *out_corrections, + unsigned long long *out_ecc); + +// NVLink probe — call before initialize_curses to set layout mode +bool nvtop_probe_nvlink_list(struct list_head *devices); + +// Reset per-GPU NVLink cache (probed flag, cached linkcount/version, cached info struct). +// Call when the monitored device set changes so newly-monitored NVLink GPUs get probed fresh. 
+void nvtop_reset_nvlink_cache(struct gpu_info *gpu_info); + #endif // EXTRACT_GPUINFO_COMMON_H__ diff --git a/include/nvtop/interface_internal_common.h b/include/nvtop/interface_internal_common.h index aec93d3..e549a26 100644 --- a/include/nvtop/interface_internal_common.h +++ b/include/nvtop/interface_internal_common.h @@ -70,9 +70,11 @@ struct device_window { WINDOW *gpu_clock_info; WINDOW *mem_clock_info; WINDOW *pcie_info; + WINDOW *nvlink_info; WINDOW *shader_cores; WINDOW *l2_cache_size; WINDOW *exec_engines; + WINDOW *nvlink_errors; bool enc_was_visible; bool dec_was_visible; nvtop_time last_decode_seen; @@ -154,6 +156,7 @@ enum device_field { device_shadercores, device_l2features, device_execengines, + device_nvlink_errors, device_field_count, }; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b485cb4..8d92f59 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -52,6 +52,8 @@ endif() if(NVIDIA_SUPPORT) target_sources(nvtop PRIVATE extract_gpuinfo_nvidia.c) +else() + target_sources(nvtop PRIVATE nvlink_nvidia_disabled.c) endif() if(ASCEND_SUPPORT) diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c index 33670a6..e66bd57 100644 --- a/src/extract_gpuinfo_nvidia.c +++ b/src/extract_gpuinfo_nvidia.c @@ -21,21 +21,85 @@ #include "nvtop/common.h" #include "nvtop/extract_gpuinfo_common.h" +#include "nvtop/time.h" #include #include +#include #include #include #include #include #include -#define NVML_SUCCESS 0 -#define NVML_ERROR_NOT_SUPPORTED 3 -#define NVML_ERROR_INSUFFICIENT_SIZE 7 - -typedef struct nvmlDevice *nvmlDevice_t; -typedef int nvmlReturn_t; // store the enum as int +// We do NOT include nvml.h — nvtop uses dlsym function pointers for all NVML +// functions, and including nvml.h would conflict with those declarations. +// Instead, we manually declare nvmlFieldValue_t and its dependencies here. 
+// This satisfies the maintainer's requirement to use the proper struct type +// instead of raw memcpy offsets, without breaking the dlsym architecture. + +// Core NVML types needed throughout the file (from nvml.h — cannot include directly +// due to dlsym function pointer conflicts with nvtop's architecture). + +// NVML return codes (subset — we only use NVML_SUCCESS and NVML_ERROR_NOT_SUPPORTED) +typedef enum nvmlReturn_enum { + NVML_SUCCESS = 0, + NVML_ERROR_UNINITIALIZED = 1, + NVML_ERROR_INVALID_ARGUMENT = 2, + NVML_ERROR_NOT_SUPPORTED = 3, + NVML_ERROR_NO_PERMISSION = 4, + NVML_ERROR_INSUFFICIENT_SIZE = 7, +} nvmlReturn_t; + +// Opaque device handle (nvml.h defines as struct nvmlDevice_st*) +typedef struct nvmlDevice_st *nvmlDevice_t; + +// nvmlFieldValue_t and its dependencies (manually declared to avoid including nvml.h). +// These match nvml.h struct/enum definitions from CUDA 12.x. +typedef enum nvmlValueType_enum { + NVML_VALUE_TYPE_DOUBLE = 0, + NVML_VALUE_TYPE_UNSIGNED_INT = 1, + NVML_VALUE_TYPE_UNSIGNED_LONG = 2, + NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, + NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, + NVML_VALUE_TYPE_SIGNED_INT = 5, + NVML_VALUE_TYPE_UNSIGNED_SHORT = 6, + NVML_VALUE_TYPE_COUNT +} nvmlValueType_t; + +typedef union nvmlValue_st { + double dVal; + int siVal; + unsigned int uiVal; + unsigned long ulVal; + unsigned long long ullVal; + signed long long sllVal; + unsigned short usVal; +} nvmlValue_t; + +typedef struct nvmlFieldValue_st { + unsigned int fieldId; + unsigned int scopeId; + long long timestamp; + long long latencyUsec; + nvmlValueType_t valueType; + nvmlReturn_t nvmlReturn; + nvmlValue_t value; +} nvmlFieldValue_t; + +// NVML field IDs for NVLink throughput and CRC corrections (from nvml.h) +#ifndef NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX +#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX 140 +#endif +#ifndef NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX +#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX 141 +#endif +#ifndef 
NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 38 +#endif +#ifndef NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL 160 +#endif // Init and shutdown @@ -207,6 +271,12 @@ static nvmlReturn_t (*nvmlDeviceGetMPSComputeRunningProcesses[4])(nvmlDevice_t d #define NVML_DEVICE_MIG_ENABLE 0x1 nvmlReturn_t (*nvmlDeviceGetMigMode)(nvmlDevice_t device, unsigned int *currentMode, unsigned int *pendingMode); +// NVLink functions (not present in older NVML versions, gracefully handled) +static nvmlReturn_t (*nvmlDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, unsigned int *isActive); +static nvmlReturn_t (*nvmlDeviceGetNvLinkVersion)(nvmlDevice_t device, unsigned int link, unsigned int *version); +static nvmlReturn_t (*nvmlDeviceGetNvLinkErrorCounter)(nvmlDevice_t device, unsigned int counter, unsigned int link, unsigned long long *value); +static nvmlReturn_t (*nvmlDeviceGetFieldValues)(nvmlDevice_t, unsigned int, nvmlFieldValue_t *); + static void *libnvidia_ml_handle; static nvmlReturn_t last_nvml_return_status = NVML_SUCCESS; @@ -276,6 +346,33 @@ struct gpu_info_nvidia { nvmlDevice_t gpuhandle; bool isInMigMode; unsigned long long last_utilization_timestamp; + + // NVLink throughput via NVML API (raw counters, aggregate across all links) + unsigned long long nvlink_last_tx; // Cumulative aggregate TX for delta computation + unsigned long long nvlink_last_rx; // Cumulative aggregate RX for delta computation + nvtop_time nvlink_last_poll_time; // Timestamp for poll throttling + + // NVLink error counter baselines (cumulative since boot, tracked per-device) + unsigned long long baseline_errors; // Cumulative errors at last read + unsigned long long baseline_corrections; // Cumulative corrections at last read + unsigned long long baseline_ecc_errors; // Cumulative ECC data errors at last read + bool nvlink_errors_baseline_read; // True after first read 
establishes baseline + + // Display-ready error/correction/ECC counts (computed in refresh_dynamic_info) + unsigned long long display_errors; // Errors since nvtop launch + unsigned long long display_corrections; // Corrections since nvtop launch + unsigned long long display_ecc_errors; // ECC data errors since nvtop launch + + // Cached NVLink hardware properties (probe once, reuse forever) + bool nvlink_probed; // true after first probe, regardless of result + unsigned int nvlink_cached_linkcount; // 0 = no NVLink links + unsigned int nvlink_cached_version; // Marketing version, 0 = not yet probed + + // Cached nvlink_info struct: populated during refresh_dynamic_info, + // returned by nvtop_get_nvlink_info in the draw path to avoid redundant + // NVML calls and CLI forks on every draw cycle. + struct nvlink_info cached_nvlink_info; + bool cached_nvlink_info_populated; }; static LIST_HEAD(allocations); @@ -288,6 +385,20 @@ static void gpuinfo_nvidia_populate_static_info(struct gpu_info *_gpu_info); static void gpuinfo_nvidia_refresh_dynamic_info(struct gpu_info *_gpu_info); static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info); +// Forward declaration for nvlink_read_errors (defined later, called from refresh_dynamic_info) +static void nvlink_read_errors(nvmlDevice_t device, unsigned int linkCount, struct gpu_info_nvidia *gpu_info); + +// Forward declaration for nvlink_refresh_cached_info (defined later, called from refresh_dynamic_info) +// Populates gpu_info->cached_nvlink_info with throughput + error data. +static void nvlink_refresh_cached_info(struct gpu_info_nvidia *gpu_info, unsigned int linkCount); + +// Remap raw NVML NVLink protocol version to the marketing version (forward declaration) +static unsigned int nvlink_marketing_version(unsigned int raw_version); + +// Probe NVLink link count and version, caching results in gpu_info_nvidia to avoid +// repeated NVML API calls on every refresh cycle. 
Returns cached linkCount (0 if no NVLink). +unsigned nvlink_probe_and_cache(struct gpu_info_nvidia *gpu_info); + struct gpu_vendor gpu_vendor_nvidia = { .init = gpuinfo_nvidia_init, .shutdown = gpuinfo_nvidia_shutdown, @@ -470,6 +581,12 @@ static bool gpuinfo_nvidia_init(void) { nvmlDeviceGetProcessUtilization = dlsym(libnvidia_ml_handle, "nvmlDeviceGetProcessUtilization"); nvmlDeviceGetMigMode = dlsym(libnvidia_ml_handle, "nvmlDeviceGetMigMode"); + // NVLink functions (optional - not available on all drivers/hardware) + nvmlDeviceGetNvLinkState = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkState"); + nvmlDeviceGetNvLinkVersion = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkVersion"); + nvmlDeviceGetNvLinkErrorCounter = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkErrorCounter"); + nvmlDeviceGetFieldValues = dlsym(libnvidia_ml_handle, "nvmlDeviceGetFieldValues"); + last_nvml_return_status = nvmlInit(); if (last_nvml_return_status != NVML_SUCCESS) { return false; @@ -749,6 +866,23 @@ static void gpuinfo_nvidia_refresh_dynamic_info(struct gpu_info *_gpu_info) { SET_GPUINFO_DYNAMIC(dynamic_info, multi_instance_mode, currentMode == NVML_DEVICE_MIG_ENABLE); } } + + // NVLink: refresh error counters, throughput, and populate cached nvlink_info + // GPUs are non-hot-swappable — all NVLink probing/computation happens here + // (refresh path), and nvtop_get_nvlink_info() just returns the cached copy + // in the draw path. + // "supported but no bridge" case: version is probed before link state, so + // cached_version > 0 means NVLink hardware exists even with linkCount == 0. + if (nvmlDeviceGetNvLinkState) { + unsigned int linkCount = nvlink_probe_and_cache(gpu_info); + if (linkCount > 0 || gpu_info->nvlink_cached_version > 0) { + // Error counters only make sense when links are active. 
+      if (linkCount > 0 && (nvmlDeviceGetNvLinkErrorCounter || nvmlDeviceGetFieldValues))
+        nvlink_read_errors(device, linkCount, gpu_info);
+      // Throughput + cached info struct (handles 0-link case for display)
+      nvlink_refresh_cached_info(gpu_info, linkCount);
+    }
+  }
 }
 
 static void gpuinfo_nvidia_get_process_utilization(struct gpu_info_nvidia *gpu_info, unsigned num_processes_recovered,
@@ -936,3 +1070,355 @@ static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info) {
                                                      gpu_info->base.dynamic_info.multi_instance_mode))
     gpuinfo_nvidia_get_process_utilization(gpu_info, _gpu_info->processes_count, _gpu_info->processes);
 }
+
+// Local copies of the NVML NVLink constants used below. This file does not include
+// nvml.h, so the values are spelled out here; the #ifndef guards only make the
+// definitions yield to an earlier macro of the same name.
+// Upper bound on the link indices probed by nvlink_probe_and_cache().
+#ifndef NVML_NVLINK_MAX_LINKS_INTERNAL
+#define NVML_NVLINK_MAX_LINKS_INTERNAL 36
+#endif
+
+#ifndef NVML_NVLINK_ERROR_DL_REPLAY
+// NVML error counter types (the "counter" selector of nvmlDeviceGetNvLinkErrorCounter)
+#define NVML_NVLINK_ERROR_DL_REPLAY 0
+#define NVML_NVLINK_ERROR_DL_RECOVERY 1
+#define NVML_NVLINK_ERROR_DL_CRC_FLIT 2
+#define NVML_NVLINK_ERROR_DL_CRC_DATA 3
+#define NVML_NVLINK_ERROR_DL_ECC_DATA 4
+#endif
+
+// Helper: Query a single NVML field value via nvmlDeviceGetFieldValues.
+//   device   - NVML device handle to query
+//   field_id - NVML field identifier (one of the NVML_FI_DEV_* values)
+//   scope_id - copied verbatim into the request's scopeId
+//   out_val  - receives the value's ullVal member on success; untouched on failure
+// Returns true if the field was successfully read into *out_val, false when the
+// NVML symbol was not resolved or either the call or the per-field status failed.
+// NOTE(review): no caller of this helper is visible in this patch — confirm it is
+// still needed, otherwise it will trigger an unused-function warning.
+static bool nvlink_query_field(nvmlDevice_t device, unsigned int field_id,
+                               unsigned int scope_id, unsigned long long *out_val) {
+  // The symbol is looked up with dlsym at init and may be missing.
+  if (!nvmlDeviceGetFieldValues)
+    return false;
+  nvmlFieldValue_t fv = {0};
+  fv.fieldId = field_id;
+  fv.scopeId = scope_id;
+  nvmlReturn_t ret = nvmlDeviceGetFieldValues(device, 1, &fv);
+  // Both the call status and the status stored in the entry itself must be SUCCESS.
+  if (ret != NVML_SUCCESS || fv.nvmlReturn != NVML_SUCCESS)
+    return false;
+  // The value union is read as unsigned long long without inspecting fv.valueType.
+  *out_val = fv.value.ullVal;
+  return true;
+}
+
+// Probe NVLink link count and version, caching results in gpu_info_nvidia to avoid
+// repeated NVML API calls on every refresh cycle. linkCount and version are static
+// hardware properties — once discovered, they never change during the process lifetime.
+// Returns the cached linkCount (0 if no NVLink).
+unsigned nvlink_probe_and_cache(struct gpu_info_nvidia *gpu_info) {
+  // Already probed — return the cached result (even if the link count is 0).
+  if (gpu_info->nvlink_probed)
+    return gpu_info->nvlink_cached_linkcount;
+
+  unsigned int linkCount = 0;
+  unsigned int version = 0;
+
+  // Without nvmlDeviceGetNvLinkState nothing can be probed; fall through and cache zeros.
+  if (nvmlDeviceGetNvLinkState) {
+    nvmlDevice_t device = gpu_info->gpuhandle;
+
+    // Query the version BEFORE the link-state loop. A successful version read with
+    // zero active links is how callers tell "NVLink hardware, nothing connected"
+    // apart from "no NVLink hardware at all". A failed read leaves version at 0.
+    if (nvmlDeviceGetNvLinkVersion) {
+      unsigned int raw_version = 0;
+      if (nvmlDeviceGetNvLinkVersion(device, 0, &raw_version) == NVML_SUCCESS)
+        version = nvlink_marketing_version(raw_version);
+    }
+
+    // Count active links. Counted links must be contiguous from 0: probing stops
+    // at the first slot that is not active, whether NVML reports that as an error
+    // (e.g. NVML_ERROR_NOT_SUPPORTED) or as SUCCESS with isActive == 0, so that
+    // unconnected slots are never reported as links.
+    for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS_INTERNAL; link++) {
+      unsigned int isActive = 0;
+      if (nvmlDeviceGetNvLinkState(device, link, &isActive) != NVML_SUCCESS || !isActive)
+        break;
+      linkCount = link + 1;
+    }
+  }
+
+  // Cache results
+  gpu_info->nvlink_probed = true;
+  gpu_info->nvlink_cached_linkcount = linkCount;
+  gpu_info->nvlink_cached_version = version;
+  return linkCount;
+}
+
+// Read NVLink error counters (replay, recovery, CRC flit, CRC data) for links
+// [0, linkCount) and store the result in the persistent gpu_info struct.
+// Uses baseline subtraction so display_errors only reflects errors since nvtop launch:
+// the first call records the cumulative sum as the baseline and displays 0, later
+// calls display (cumulative - baseline), clamped at 0.
+// Called from refresh_dynamic_info so it does NOT run during the startup probe in nvtop_probe_nvlink_list.
+// Corrections and ECC data errors are read separately in nvlink_refresh_cached_info() via NVML batched field query.
+static void nvlink_read_errors(nvmlDevice_t device, unsigned int linkCount, struct gpu_info_nvidia *gpu_info) {
+  static const unsigned int error_counters[] = {
+      NVML_NVLINK_ERROR_DL_REPLAY,
+      NVML_NVLINK_ERROR_DL_RECOVERY,
+      NVML_NVLINK_ERROR_DL_CRC_FLIT,
+      NVML_NVLINK_ERROR_DL_CRC_DATA,
+  };
+  const unsigned int num_counters = sizeof(error_counters) / sizeof(error_counters[0]);
+
+  // Sum every counter type over every link; counters that fail to read are skipped.
+  unsigned long long cumulative_errors = 0;
+  if (nvmlDeviceGetNvLinkErrorCounter) {
+    for (unsigned int link = 0; link < linkCount; link++) {
+      for (unsigned int idx = 0; idx < num_counters; idx++) {
+        unsigned long long val = 0;
+        // NVML's argument order is (device, link, counter, value): the link index
+        // comes BEFORE the counter selector. Passing them the other way round
+        // queries the wrong link/counter pairs.
+        if (nvmlDeviceGetNvLinkErrorCounter(device, link, error_counters[idx], &val) == NVML_SUCCESS)
+          cumulative_errors += val;
+      }
+    }
+  }
+
+  // Baseline subtraction: show only errors since nvtop launch
+  if (!gpu_info->nvlink_errors_baseline_read) {
+    // First read — establish baseline, display zeros
+    gpu_info->baseline_errors = cumulative_errors;
+    gpu_info->nvlink_errors_baseline_read = true;
+    gpu_info->display_errors = 0;
+  } else {
+    // Subsequent reads — show delta from baseline (never negative)
+    gpu_info->display_errors =
+        cumulative_errors > gpu_info->baseline_errors ? cumulative_errors - gpu_info->baseline_errors : 0;
+  }
+}
+
+// Public getter for display-ready error/correction/ECC counts from a struct gpu_info.
+// Each out_* pointer may be NULL if the caller does not need that value.
+// Returns true if baseline has been established at least once; returns false (and
+// writes nothing) for a NULL or non-NVIDIA device or before the first baseline read.
+bool nvtop_get_nvlink_error_counts(struct gpu_info *_gpu_info,
+                                   unsigned long long *out_errors,
+                                   unsigned long long *out_corrections,
+                                   unsigned long long *out_ecc) {
+  if (!_gpu_info || !_gpu_info->vendor)
+    return false;
+
+  // NVLink is an NVIDIA-only technology — skip non-NVIDIA GPUs immediately
+  if (strcmp(_gpu_info->vendor->name, "NVIDIA") != 0)
+    return false;
+
+  struct gpu_info_nvidia *gpu_info = container_of(_gpu_info, struct gpu_info_nvidia, base);
+  if (!gpu_info->nvlink_errors_baseline_read)
+    return false;
+
+  if (out_errors)
+    *out_errors = gpu_info->display_errors;
+  if (out_corrections)
+    *out_corrections = gpu_info->display_corrections;
+  if (out_ecc)
+    *out_ecc = gpu_info->display_ecc_errors;
+  return true;
+}
+
+// Remap raw NVML NVLink protocol version to the marketing version.
+// NVML raw values do NOT equal marketing versions (raw 5 = 3.1 -> rounds to 3).
+// Values not listed in the table are returned unchanged.
+static unsigned int nvlink_marketing_version(unsigned int raw_version) {
+  // Raw NVML value to rounded marketing major version.
+  switch (raw_version) {
+  case 1: return 1;
+  case 2: return 2;
+  case 3: return 2; // NVLink 2.2 -> 2
+  case 4: return 3; // NVLink 3.0 -> 3
+  case 5: return 3; // NVLink 3.1 -> 3
+  case 6: return 4; // NVLink 4.0
+  case 7: return 5; // NVLink 5.0
+  case 8: return 6; // NVLink 6.0 (Rubin)
+  default: return raw_version;
+  }
+}
+
+// Get NVLink info (version, link count, aggregate throughput via NVML API).
+// Populate cached_nvlink_info with link count, version, throughput, and error/correction counts.
+// Called from refresh_dynamic_info on every refresh cycle (refresh path).
+// GPUs are non-hot-swappable, so all NVLink data is computed here and cached —
+// nvtop_get_nvlink_info() in the draw path just returns the cached copy.
+static void nvlink_refresh_cached_info(struct gpu_info_nvidia *gpu_info, unsigned int linkCount) {
+  struct nvlink_info *cache = &gpu_info->cached_nvlink_info;
+
+  cache->supported = true;
+  cache->num_links = linkCount;
+  cache->version = gpu_info->nvlink_cached_version;
+
+  // No active links: nothing to measure. Publish zeroed counters so the draw
+  // path can still show the version with a link count of 0.
+  if (linkCount == 0) {
+    cache->has_throughput = false;
+    cache->aggregate_tx = 0;
+    cache->aggregate_rx = 0;
+    cache->total_errors = 0;
+    cache->total_corrections = 0;
+    cache->total_ecc_errors = 0;
+    gpu_info->cached_nvlink_info_populated = true;
+    return;
+  }
+
+  // Time elapsed since the previous poll of this device. A zero tv_sec marks
+  // "never polled" (initial state, or after nvtop_reset_nvlink_cache).
+  nvtop_time current_time;
+  nvtop_get_current_time(&current_time);
+  const bool have_prev_sample = gpu_info->nvlink_last_poll_time.tv_sec > 0;
+  const double delta_s = have_prev_sample ? nvtop_difftime(gpu_info->nvlink_last_poll_time, current_time) : 0;
+  const bool can_compute_rate = have_prev_sample && delta_s > 0;
+
+  // Single batched nvmlDeviceGetFieldValues call for TX, RX, corrections, and ECC errors.
+  // The throughput fields are requested with scopeId = UINT_MAX, the two error
+  // totals with scopeId = 0. This runs on every refresh; no throttling is done here.
+  enum { NVLINK_FIELD_TX, NVLINK_FIELD_RX, NVLINK_FIELD_CORRECTIONS, NVLINK_FIELD_ECC, NVLINK_FIELD_COUNT };
+  nvmlFieldValue_t batch[NVLINK_FIELD_COUNT] = {0};
+  batch[NVLINK_FIELD_TX].fieldId = NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX;
+  batch[NVLINK_FIELD_TX].scopeId = UINT_MAX;
+  batch[NVLINK_FIELD_RX].fieldId = NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX;
+  batch[NVLINK_FIELD_RX].scopeId = UINT_MAX;
+  batch[NVLINK_FIELD_CORRECTIONS].fieldId = NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL;
+  batch[NVLINK_FIELD_CORRECTIONS].scopeId = 0;
+  batch[NVLINK_FIELD_ECC].fieldId = NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL;
+  batch[NVLINK_FIELD_ECC].scopeId = 0;
+
+  // Each entry's nvmlReturn field is checked individually for validity.
+  bool got[NVLINK_FIELD_COUNT] = {false};
+  if (nvmlDeviceGetFieldValues &&
+      nvmlDeviceGetFieldValues(gpu_info->gpuhandle, NVLINK_FIELD_COUNT, batch) == NVML_SUCCESS) {
+    for (unsigned int idx = 0; idx < NVLINK_FIELD_COUNT; idx++)
+      got[idx] = batch[idx].nvmlReturn == NVML_SUCCESS;
+  }
+
+  // Throughput: rate = (counter delta) / (elapsed seconds), per direction.
+  // A direction that could not be read keeps its previous raw counter instead of
+  // being overwritten with 0, which would turn the next good read into a huge
+  // bogus delta. A counter that went backwards yields a rate of 0.
+  cache->aggregate_tx = 0;
+  cache->aggregate_rx = 0;
+  if (got[NVLINK_FIELD_TX]) {
+    const unsigned long long new_tx = batch[NVLINK_FIELD_TX].value.ullVal;
+    if (can_compute_rate && new_tx >= gpu_info->nvlink_last_tx)
+      cache->aggregate_tx = (unsigned long long)((double)(new_tx - gpu_info->nvlink_last_tx) / delta_s);
+    gpu_info->nvlink_last_tx = new_tx;
+  }
+  if (got[NVLINK_FIELD_RX]) {
+    const unsigned long long new_rx = batch[NVLINK_FIELD_RX].value.ullVal;
+    if (can_compute_rate && new_rx >= gpu_info->nvlink_last_rx)
+      cache->aggregate_rx = (unsigned long long)((double)(new_rx - gpu_info->nvlink_last_rx) / delta_s);
+    gpu_info->nvlink_last_rx = new_rx;
+  }
+  cache->has_throughput = can_compute_rate && (got[NVLINK_FIELD_TX] || got[NVLINK_FIELD_RX]);
+  gpu_info->nvlink_last_poll_time = current_time;
+
+  // Corrections / ECC data errors: display the delta from a baseline taken on the
+  // first poll. The baseline decision must NOT be keyed on nvlink_errors_baseline_read:
+  // nvlink_read_errors() runs earlier in the same refresh and has already set that
+  // flag, so these baselines would never be recorded and the since-boot totals
+  // would be shown instead of since-launch counts.
+  // NOTE: if a field is unreadable on the first poll but readable later, its
+  // baseline stays 0 and the cumulative value is displayed.
+  const bool first_poll = !have_prev_sample;
+  if (got[NVLINK_FIELD_CORRECTIONS]) {
+    const unsigned long long new_corrections = batch[NVLINK_FIELD_CORRECTIONS].value.ullVal;
+    if (first_poll) {
+      gpu_info->baseline_corrections = new_corrections;
+      gpu_info->nvlink_errors_baseline_read = true;
+    }
+    gpu_info->display_corrections =
+        new_corrections > gpu_info->baseline_corrections ? new_corrections - gpu_info->baseline_corrections : 0;
+  }
+  if (got[NVLINK_FIELD_ECC]) {
+    const unsigned long long new_ecc_errors = batch[NVLINK_FIELD_ECC].value.ullVal;
+    if (first_poll) {
+      gpu_info->baseline_ecc_errors = new_ecc_errors;
+      gpu_info->nvlink_errors_baseline_read = true;
+    }
+    gpu_info->display_ecc_errors =
+        new_ecc_errors > gpu_info->baseline_ecc_errors ? new_ecc_errors - gpu_info->baseline_ecc_errors : 0;
+  }
+
+  // Error/correction/ECC counts from display-ready fields
+  cache->total_errors = gpu_info->display_errors;
+  cache->total_corrections = gpu_info->display_corrections;
+  cache->total_ecc_errors = gpu_info->display_ecc_errors;
+
+  gpu_info->cached_nvlink_info_populated = true;
+}
+
+// Return cached nvlink_info struct. Called from the draw path (draw_gpu_info_ncurses)
+// to avoid redundant NVML calls and CLI forks on every draw cycle.
+// GPUs are non-hot-swappable, so the cached struct is authoritative.
+// For the startup probe (nvtop_probe_nvlink_list) before refresh_dynamic_info has run,
+// falls back to computing on-demand.
+// Returns the number of active links (also stored in nvlink_info->num_links).
+// Whenever nvlink_info is non-NULL it is fully written: all zeros unless NVLink
+// data is available for this device.
+unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *nvlink_info) {
+  if (!_gpu_info || !nvlink_info)
+    return 0;
+
+  // Zero the output up front so that EVERY early return below hands back a
+  // defined struct, including the "NVML symbol missing" path.
+  memset(nvlink_info, 0, sizeof(*nvlink_info));
+
+  // NVLink is an NVIDIA-only technology — skip non-NVIDIA GPUs immediately
+  if (strcmp(_gpu_info->vendor->name, "NVIDIA") != 0)
+    return 0;
+
+  struct gpu_info_nvidia *gpu_info = container_of(_gpu_info, struct gpu_info_nvidia, base);
+
+  // If cached info is available (after first refresh), just return it.
+  // This is the fast path — no NVML calls are made from the draw path.
+  if (gpu_info->cached_nvlink_info_populated) {
+    memcpy(nvlink_info, &gpu_info->cached_nvlink_info, sizeof(*nvlink_info));
+    return nvlink_info->num_links;
+  }
+
+  // Fallback for startup probe (nvtop_probe_nvlink_list) before refresh_dynamic_info ran:
+  // Populate minimal info (link count + version, no throughput) to determine if NVLink exists.
+  if (!nvmlDeviceGetNvLinkState)
+    return 0;
+
+  const unsigned int linkCount = nvlink_probe_and_cache(gpu_info);
+
+  // "supported but no bridge" case: the version is probed before link state, so a
+  // non-zero cached version means NVLink hardware was detected even if linkCount == 0.
+  if (gpu_info->nvlink_cached_version > 0) {
+    nvlink_info->supported = true;
+    nvlink_info->num_links = linkCount;
+    nvlink_info->version = gpu_info->nvlink_cached_version;
+  }
+
+  return nvlink_info->num_links;
+}
+
+// Reset all NVLink caches for a single GPU. Called when monitored device set changes.
+void nvtop_reset_nvlink_cache(struct gpu_info *_gpu_info) {
+  if (!_gpu_info || !_gpu_info->vendor)
+    return;
+
+  // NVLink is an NVIDIA-only technology — skip non-NVIDIA GPUs immediately
+  if (strcmp(_gpu_info->vendor->name, "NVIDIA") != 0)
+    return;
+
+  struct gpu_info_nvidia *gpu_info = container_of(_gpu_info, struct gpu_info_nvidia, base);
+
+  // Hardware probe results: force the next probe to query NVML again.
+  gpu_info->nvlink_probed = false;
+  gpu_info->nvlink_cached_linkcount = 0;
+  gpu_info->nvlink_cached_version = 0;
+
+  // Cached struct handed to the draw path.
+  gpu_info->cached_nvlink_info_populated = false;
+  memset(&gpu_info->cached_nvlink_info, 0, sizeof(gpu_info->cached_nvlink_info));
+
+  // Error baselines AND the display-ready deltas derived from them. Leaving the
+  // display_* values behind would let stale counts be copied into the cache again
+  // on the next refresh if a counter cannot be re-read.
+  gpu_info->baseline_errors = 0;
+  gpu_info->baseline_corrections = 0;
+  gpu_info->baseline_ecc_errors = 0;
+  gpu_info->nvlink_errors_baseline_read = false;
+  gpu_info->display_errors = 0;
+  gpu_info->display_corrections = 0;
+  gpu_info->display_ecc_errors = 0;
+
+  // Throughput state: a zero timestamp means "no previous sample".
+  gpu_info->nvlink_last_tx = 0;
+  gpu_info->nvlink_last_rx = 0;
+  gpu_info->nvlink_last_poll_time = (struct timespec){0};
+}
diff --git a/src/interface.c b/src/interface.c
index ae23199..dd117d8 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -46,8 +46,56 @@ static unsigned int sizeof_device_field[device_field_count] = {
     [device_name] = 11, [device_fan_speed] = 11, [device_temperature] = 10, [device_power] = 15,
     [device_clock] = 11, [device_mem_clock] = 12, [device_pcie] = 46, [device_shadercores] = 7,
     [device_l2features] = 11, [device_execengines] = 11,
+    [device_nvlink_errors] = 33,
 };
 
+// True if any monitored device has NVLink hardware support (even if 0 links active).
+// Controls whether to allocate the nvlink_info window for displaying "NVL3 0x" etc.
+static bool any_device_has_nvlink = false;
+// True if any monitored device has NVLink with active links (linkCount > 0).
+// Controls layout adjustments (shrinking fan field) and the nvlink_errors
+// window allocation.
+static bool any_device_has_nvlink_active = false;
+
+// When NVLink has ACTIVE links, shrink fan field from 11 to 8 to make room on line 2.
+// Only done when there are actual links to show throughput for — 0-link "NVL3 0x"
+// display does not require any padding reduction.
+static void nvtop_adjust_field_sizes_for_nvlink(void) {
+  // Rewrites the shared sizeof_device_field table in place, so it must run before
+  // any window geometry is derived from that table.
+  if (any_device_has_nvlink_active) {
+    sizeof_device_field[device_fan_speed] = 8; // "FAN %3u%%" (was 11 with padding)
+  } else {
+    sizeof_device_field[device_fan_speed] = 11; // Restore default padding
+  }
+}
+
+// Scan the device list and record, in the file-level flags any_device_has_nvlink and
+// any_device_has_nvlink_active, whether any device reports NVLink support and
+// whether any device reports at least one link.
+// Returns true if at least one device reports NVLink support.
+bool nvtop_probe_nvlink_list(struct list_head *devices) {
+  // Skip re-probing if we already know at least one device has NVLink.
+  // NVLink support is a static hardware property that does not change at runtime.
+  // NOTE(review): because of this early return, neither flag is recomputed once
+  // any_device_has_nvlink is true — in particular any_device_has_nvlink_active keeps
+  // its first value. Confirm this is intended when the monitored device set changes.
+  if (any_device_has_nvlink)
+    return true;
+
+  bool has_nvlink = false;
+  bool has_nvlink_active = false;
+
+  struct gpu_info *gpu;
+  list_for_each_entry(gpu, devices, list) {
+    struct nvlink_info nvl = {0};
+    // nvtop_get_nvlink_info returns num_links (could be 0 for "supported but no bridge").
+    // Check nvl.supported separately to catch the 0-link case.
+    nvtop_get_nvlink_info(gpu, &nvl);
+    if (nvl.supported) {
+      has_nvlink = true;
+      if (nvl.num_links > 0)
+        has_nvlink_active = true;
+    }
+  }
+
+  // Publish the scan result for the layout and drawing code in this file.
+  any_device_has_nvlink = has_nvlink;
+  any_device_has_nvlink_active = has_nvlink_active;
+
+  return has_nvlink;
+}
+
 static unsigned int sizeof_process_field[process_field_count] = {
     [process_pid] = 7, [process_user] = 4, [process_gpu_id] = 3, [process_type] = 8,
     [process_gpu_rate] = 4, [process_enc_rate] = 4, [process_dec_rate] = 4,
@@ -70,7 +118,7 @@ static void alloc_device_window(unsigned int start_row, unsigned int start_col,
   if (dwin->pcie_info == NULL)
     goto alloc_error;
 
-  // Line 2 = GPU clk | MEM clk | Temp | Fan | Power
+  // Line 2 = GPU clk | MEM clk | Temp | Fan | Power | NVLink
   dwin->gpu_clock_info = newwin(1, sizeof_device_field[device_clock], start_row + 1, start_col);
   if (dwin->gpu_clock_info == NULL)
     goto alloc_error;
@@ -94,6 +142,18 @@ static void alloc_device_window(unsigned int start_row, unsigned int start_col,
              sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed]);
   if (dwin->power_info == NULL)
     goto alloc_error;
+  // NVLink appended to power_info on the same row (start_row + 1), using remaining width
+  if (any_device_has_nvlink) {
+    dwin->nvlink_info =
+        newwin(1, sizeof_device_field[device_pcie] - sizeof_device_field[device_power] - spacer * 3, start_row + 1,
+               start_col + spacer * 4 + sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] +
+                   sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed] +
+                   spacer * 2 + sizeof_device_field[device_power]);
+    if (dwin->nvlink_info == NULL)
+      goto alloc_error;
+  } else {
+    dwin->nvlink_info = NULL;
+  }
 
   // Line 3 = GPU used | MEM used | Encoder | Decoder
 
@@ -177,6 +237,18 @@ static void alloc_device_window(unsigned int start_row, unsigned int start_col,
              start_col + spacer * 2 + sizeof_device_field[device_shadercores] + sizeof_device_field[device_l2features]);
   if (dwin->exec_engines == NULL)
     goto alloc_error;
+  // NVLink errors appended to exec_engines on the same row (start_row + 3), conditional on NVLink
+  // Only allocate for devices with active links — 0-link devices have no error counters to show.
+ if (any_device_has_nvlink_active) { + dwin->nvlink_errors = + newwin(1, sizeof_device_field[device_nvlink_errors], start_row + 3, + start_col + spacer * 3 + sizeof_device_field[device_shadercores] + + sizeof_device_field[device_l2features] + sizeof_device_field[device_execengines]); + if (dwin->nvlink_errors == NULL) + goto alloc_error; + } else { + dwin->nvlink_errors = NULL; + } return; alloc_error: @@ -205,6 +277,10 @@ static void free_device_windows(struct device_window *dwin) { delwin(dwin->shader_cores); delwin(dwin->l2_cache_size); delwin(dwin->exec_engines); + if (dwin->nvlink_info != NULL) + delwin(dwin->nvlink_info); + if (dwin->nvlink_errors != NULL) + delwin(dwin->nvlink_errors); } static void alloc_process_with_option(struct nvtop_interface *interface, unsigned posX, unsigned posY, unsigned sizeX, @@ -347,10 +423,18 @@ static void alloc_plot_window(unsigned devices_count, struct window_position *pl } static unsigned device_length(void) { - return max(sizeof_device_field[device_name] + sizeof_device_field[device_pcie] + 1, - sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] + - sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed] + - sizeof_device_field[device_power] + 5); + unsigned line1 = sizeof_device_field[device_name] + sizeof_device_field[device_pcie] + 1; + + // Line 2 base: clock, mem_clock, temp, fan, power + spacers (4 spacers + 1 = 5) + // Do NOT expand for NVLink — the NVLink window on line 2 extends past the + // nominal panel edge and ncurses renders it fine. Expanding it would make + // line 3 bar charts (GPU/MEM/Enc/Dec) too wide. This applies to both the + // 0-link case ("NVL3 0x") and the active-links case (with throughput). 
+ unsigned line2 = sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] + + sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed] + + sizeof_device_field[device_power] + 5; + + return max(line1, line2); } static pid_t nvtop_pid; @@ -367,6 +451,10 @@ static void initialize_all_windows(struct nvtop_interface *dwin) { struct window_position plot_positions[MAX_CHARTS]; struct window_position setup_position; + // NVLink layout adjustments must happen before panel dimensions are computed. + // any_device_has_nvlink_active is set by the probe that runs before this function. + nvtop_adjust_field_sizes_for_nvlink(); + compute_sizes_from_layout(devices_count, dwin->options.has_gpu_info_bar ? 4 : 3, device_length(), rows - 1, cols, dwin->options.gpu_specific_opts, dwin->options.process_fields_displayed, device_positions, &dwin->num_plots, plot_positions, map_device_to_plot, &process_position, &setup_position, @@ -557,10 +645,10 @@ static void draw_temp_color(WINDOW *win, unsigned int temp, unsigned int temp_sl wnoutrefresh(win); } -static void print_pcie_at_scale(WINDOW *win, unsigned int value) { +static void print_data_at_scale(WINDOW *win, unsigned long long value) { int prefix_off; double val_d = value; - for (prefix_off = 1; prefix_off < 5 && val_d >= 1000.; ++prefix_off) { + for (prefix_off = 1; prefix_off < 6 && val_d >= 1000.; ++prefix_off) { val_d = val_d / 1024.; } if (val_d >= 100.) { @@ -575,6 +663,10 @@ static void print_pcie_at_scale(WINDOW *win, unsigned int value) { wprintw(win, " %sB/s", memory_prefix[prefix_off]); } +// print_data_at_scale (renamed from print_pcie_at_scale): shared by the PCIe and NVLink throughput fields. +// NOTE(review): the scaling loop can now exit with prefix_off == 6, so memory_prefix[] needs at least 7 entries — confirm. +// Takes unsigned long long to avoid 32-bit truncation on high-throughput hardware.
+ static inline void werase_and_wnoutrefresh(WINDOW *w) { werase(w); wnoutrefresh(w); @@ -778,19 +870,41 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte // FAN if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, fan_speed)) { - mvwprintw(dev->fan_speed, 0, 0, " FAN %3u%% ", - device->dynamic_info.fan_speed > 100 ? 100 : device->dynamic_info.fan_speed); - mvwchgat(dev->fan_speed, 0, 1, 3, 0, cyan_color, NULL); + if (any_device_has_nvlink_active) { + mvwprintw(dev->fan_speed, 0, 0, "FAN %3u%%", + device->dynamic_info.fan_speed > 100 ? 100 : device->dynamic_info.fan_speed); + mvwchgat(dev->fan_speed, 0, 0, 3, 0, cyan_color, NULL); + } else { + mvwprintw(dev->fan_speed, 0, 0, " FAN %3u%% ", + device->dynamic_info.fan_speed > 100 ? 100 : device->dynamic_info.fan_speed); + mvwchgat(dev->fan_speed, 0, 1, 3, 0, cyan_color, NULL); + } } else if (device->static_info.integrated_graphics) { - mvwprintw(dev->fan_speed, 0, 0, " CPU-FAN "); - mvwchgat(dev->fan_speed, 0, 2, 7, 0, cyan_color, NULL); + if (any_device_has_nvlink_active) { + mvwprintw(dev->fan_speed, 0, 0, "CPU-FAN"); + mvwchgat(dev->fan_speed, 0, 0, 7, 0, cyan_color, NULL); + } else { + mvwprintw(dev->fan_speed, 0, 0, " CPU-FAN "); + mvwchgat(dev->fan_speed, 0, 2, 7, 0, cyan_color, NULL); + } } else if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, fan_rpm)) { - mvwprintw(dev->fan_speed, 0, 0, "FAN %4uRPM", - device->dynamic_info.fan_rpm > 9999 ? 9999 : device->dynamic_info.fan_rpm); - mvwchgat(dev->fan_speed, 0, 0, 3, 0, cyan_color, NULL); + if (any_device_has_nvlink_active) { + mvwprintw(dev->fan_speed, 0, 0, "FAN%4uR", // 8 columns, like the compact "FAN %3u%%" form; keeps 4-digit RPM readable + device->dynamic_info.fan_rpm > 9999 ? 9999 : device->dynamic_info.fan_rpm); + mvwchgat(dev->fan_speed, 0, 0, 3, 0, cyan_color, NULL); + } else { + mvwprintw(dev->fan_speed, 0, 0, "FAN %4uRPM", + device->dynamic_info.fan_rpm > 9999 ? 
9999 : device->dynamic_info.fan_rpm); + mvwchgat(dev->fan_speed, 0, 0, 3, 0, cyan_color, NULL); + } } else { - mvwprintw(dev->fan_speed, 0, 0, " FAN N/A "); - mvwchgat(dev->fan_speed, 0, 2, 3, 0, cyan_color, NULL); + if (any_device_has_nvlink_active) { + mvwprintw(dev->fan_speed, 0, 0, "FAN N/A"); + mvwchgat(dev->fan_speed, 0, 0, 3, 0, cyan_color, NULL); + } else { + mvwprintw(dev->fan_speed, 0, 0, " FAN N/A "); + mvwchgat(dev->fan_speed, 0, 2, 3, 0, cyan_color, NULL); + } } wnoutrefresh(dev->fan_speed); @@ -830,6 +944,40 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte mvwchgat(dev->power_info, 0, 0, 3, 0, cyan_color, NULL); wnoutrefresh(dev->power_info); + // NVLink info (on same row as power_info) + if (dev->nvlink_info != NULL) { + werase(dev->nvlink_info); + struct nvlink_info nvl_info = {0}; + nvtop_get_nvlink_info(device, &nvl_info); + if (nvl_info.supported) { + wcolor_set(dev->nvlink_info, cyan_color, NULL); + wprintw(dev->nvlink_info, "NVL"); + wcolor_set(dev->nvlink_info, magenta_color, NULL); + if (nvl_info.version > 0) + wprintw(dev->nvlink_info, "%u", nvl_info.version); + else + wprintw(dev->nvlink_info, "?"); + wstandend(dev->nvlink_info); + + if (nvl_info.num_links > 0) { + // Active links: show link count and throughput + if (nvl_info.num_links < 10) + wprintw(dev->nvlink_info, " %ux ", nvl_info.num_links); + else + wprintw(dev->nvlink_info, "%ux ", nvl_info.num_links); + + if (nvl_info.has_throughput) { + unsigned long long total_kib = nvl_info.aggregate_tx + nvl_info.aggregate_rx; + print_data_at_scale(dev->nvlink_info, total_kib); + } + } else { + // No active links (no bridge connected) — show "0x" + wprintw(dev->nvlink_info, " 0x"); + } + } + wnoutrefresh(dev->nvlink_info); + } + // PICe throughput werase(dev->pcie_info); if (device->static_info.integrated_graphics) { @@ -852,14 +1000,14 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte wprintw(dev->pcie_info, " RX: "); 
wstandend(dev->pcie_info); if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, pcie_rx)) - print_pcie_at_scale(dev->pcie_info, device->dynamic_info.pcie_rx); + print_data_at_scale(dev->pcie_info, device->dynamic_info.pcie_rx); else wprintw(dev->pcie_info, "N/A"); wcolor_set(dev->pcie_info, magenta_color, NULL); wprintw(dev->pcie_info, " TX: "); wstandend(dev->pcie_info); if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, pcie_tx)) - print_pcie_at_scale(dev->pcie_info, device->dynamic_info.pcie_tx); + print_data_at_scale(dev->pcie_info, device->dynamic_info.pcie_tx); else wprintw(dev->pcie_info, "N/A"); @@ -901,6 +1049,35 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte wprintw(dev->exec_engines, "N/A"); wnoutrefresh(dev->exec_engines); + + // NVLink errors/corrections/ECC (window exists only when it was allocated for NVLink) + if (dev->nvlink_errors != NULL) { + werase(dev->nvlink_errors); + unsigned long long err_cnt = 0, cor_cnt = 0, ecc_cnt = 0; + if (nvtop_get_nvlink_error_counts(device, &err_cnt, &cor_cnt, &ecc_cnt)) { + wcolor_set(dev->nvlink_errors, cyan_color, NULL); + wprintw(dev->nvlink_errors, "NVL"); + wstandend(dev->nvlink_errors); + // FLIT errors (field 38); every counter here is printed modulo 100000 to fit 5 digits + wprintw(dev->nvlink_errors, " FL:"); + if (err_cnt > 0) + wcolor_set(dev->nvlink_errors, red_color, NULL); + wprintw(dev->nvlink_errors, "%05u", (unsigned)(err_cnt % 100000)); + wstandend(dev->nvlink_errors); + // ECC data errors (field 160) + wprintw(dev->nvlink_errors, " EE:"); + if (ecc_cnt > 0) + wcolor_set(dev->nvlink_errors, red_color, NULL); + wprintw(dev->nvlink_errors, "%05u", (unsigned)(ecc_cnt % 100000)); + wstandend(dev->nvlink_errors); + // CRC corrections (NOTE(review): was labelled field 38, same id as FLIT errors above — confirm) + wprintw(dev->nvlink_errors, " CR:"); + if (cor_cnt > 0) + wcolor_set(dev->nvlink_errors, yellow_color, NULL); + wprintw(dev->nvlink_errors, "%05u", (unsigned)(cor_cnt % 100000)); + } + wnoutrefresh(dev->nvlink_errors); + } } dev_id++; @@ -2059,6 +2236,24 @@ void interface_check_monitored_gpu_change(struct
nvtop_interface **interface, un nvtop_interface_option options_copy = (*interface)->options; options_copy.has_monitored_set_changed = false; memset(&(*interface)->options, 0, sizeof(options_copy)); *num_monitored_gpus = interface_check_and_fix_monitored_gpus(allDevCount, monitoredGpus, nonMonitoredGpus, &options_copy); + // Runs after interface_check_and_fix_monitored_gpus() so the reset and + // re-probe below walk the same monitored list that initialize_curses() + // lays out (main() likewise probes only after that call). + any_device_has_nvlink = false; + any_device_has_nvlink_active = false; + // Reset fan field to default width — it may have been compacted to 8 for + // NVLink-active layout. Without this, initialize_curses() below would + // allocate fan_speed windows at stale width 8. + sizeof_device_field[device_fan_speed] = 11; + // Reset NVLink probes on all monitored GPUs so they get probed fresh. + { struct gpu_info *g; + list_for_each_entry(g, monitoredGpus, list) + nvtop_reset_nvlink_cache(g); + } + // Re-probe NVLink now that caches are cleared, so that + // any_device_has_nvlink_active is correct when initialize_curses() + // calls initialize_all_windows() for layout decisions. + nvtop_probe_nvlink_list(monitoredGpus); clean_ncurses(*interface); diff --git a/src/nvlink_nvidia_disabled.c b/src/nvlink_nvidia_disabled.c new file mode 100644 index 0000000..124b092 --- /dev/null +++ b/src/nvlink_nvidia_disabled.c @@ -0,0 +1,33 @@ +/* + * Fallback implementations for NVLink functions when NVIDIA support is disabled. + * All return 0 / false / no-op so the build still links when nvtop is + * configured without NVIDIA_SUPPORT.
+ */ + +#include "nvtop/extract_gpuinfo_common.h" + +unsigned nvtop_get_nvlink_info(struct gpu_info *gpu_info, struct nvlink_info *nvlink_info) { + (void)gpu_info; + (void)nvlink_info; + return 0; +} + +bool nvtop_get_nvlink_error_counts(struct gpu_info *gpu_info, + unsigned long long *out_errors, + unsigned long long *out_corrections, + unsigned long long *out_ecc) { + (void)gpu_info; + (void)out_errors; + (void)out_corrections; + (void)out_ecc; + return false; +} + +bool nvtop_probe_nvlink_list(struct list_head *devices) { + (void)devices; + return false; +} + +void nvtop_reset_nvlink_cache(struct gpu_info *gpu_info) { + (void)gpu_info; +} diff --git a/src/nvtop.c b/src/nvtop.c index 61302d2..8c1598e 100644 --- a/src/nvtop.c +++ b/src/nvtop.c @@ -312,6 +312,9 @@ int main(int argc, char **argv) { unsigned numMonitoredGpus = interface_check_and_fix_monitored_gpus(allDevCount, &monitoredGpus, &nonMonitoredGpus, &allDevicesOptions); + // Probe for NVLink before layout computation + nvtop_probe_nvlink_list(&monitoredGpus); + if (allDevicesOptions.show_startup_messages) { bool dont_show_again = show_information_messages(numWarningMessages, warningMessages); if (dont_show_again) { @@ -334,6 +337,10 @@ int main(int argc, char **argv) { signal_cont_received = 0; update_window_size_to_terminal_size(interface); } + // Probe NVLink state BEFORE monitored-set-change check, so that + // any_device_has_nvlink_active is set before initialize_all_windows() + // reads it for layout decisions. + nvtop_probe_nvlink_list(&monitoredGpus); interface_check_monitored_gpu_change(&interface, allDevCount, &numMonitoredGpus, &monitoredGpus, &nonMonitoredGpus); if (time_slept >= interface_update_interval(interface)) { gpuinfo_refresh_dynamic_info(&monitoredGpus);