From 97fdf0975eabcd75281ea925cca034d7e30a6008 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Wed, 29 Apr 2026 11:35:46 -0400
Subject: [PATCH 01/31] feat: add NVLink data extraction via NVML API

- Add nvlink_info and nvlink_link_info structs to extract_gpuinfo_common.h
- Add NVML function pointers for NVLink (link count, state, throughput, errors, ECC)
- Add nvtop_get_nvlink_info() function in extract_gpuinfo_nvidia.c
- Track throughput counters for delta-based rate calculation
- Gracefully handle missing NVLink support (no hard failure on consumer GPUs)
---
 include/nvtop/extract_gpuinfo_common.h |  27 +++++
 src/extract_gpuinfo_nvidia.c           | 140 +++++++++++++++++++++++++
 2 files changed, 167 insertions(+)

diff --git a/include/nvtop/extract_gpuinfo_common.h b/include/nvtop/extract_gpuinfo_common.h
index 9e4d1c9d..08acbcfe 100644
--- a/include/nvtop/extract_gpuinfo_common.h
+++ b/include/nvtop/extract_gpuinfo_common.h
@@ -240,4 +240,31 @@ inline unsigned busy_usage_from_time_usage_round(uint64_t current_use_ns, uint64
 
 unsigned nvtop_pcie_gen_from_link_speed(unsigned linkSpeed);
 
+// NVLink support
+#define NVTOP_NVLINK_MAX_LINKS 18
+
+// Per-lane CRC errors (up to 64 lanes per link)
+#define NVTOP_NVLINK_MAX_LANES 64
+
+struct nvlink_link_info {
+  bool active;                        // Link is active
+  unsigned long long throughput_tx;   // TX throughput in KiB/s (rate, not cumulative)
+  unsigned long long throughput_rx;   // RX throughput in KiB/s (rate, not cumulative)
+  unsigned long long errors_replay;   // Replay error count
+  unsigned long long errors_recovery; // Recovery error count
+  unsigned long long errors_crc_flit; // CRC FLIT error count
+  unsigned long long errors_crc_data; // CRC DATA error count
+  unsigned long long errors_ecc_data; // ECC DATA error count
+  unsigned long long crc_per_lane[NVTOP_NVLINK_MAX_LANES]; // Per-lane CRC corrections
+  unsigned lanes;                     // Number of lanes on this link
+};
+
+struct nvlink_info {
+  unsigned num_links;                 // Number of NVLink links on this device
+  bool supported;                     // NVLink is supported on this device
+  struct nvlink_link_info links[NVTOP_NVLINK_MAX_LINKS];
+};
+
+unsigned nvtop_get_nvlink_info(struct gpu_info *gpu_info, struct nvlink_info *nvlink_info);
+
 #endif // EXTRACT_GPUINFO_COMMON_H__
diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index 5ba298cc..dfe40971 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -207,6 +207,21 @@ static nvmlReturn_t (*nvmlDeviceGetMPSComputeRunningProcesses[4])(nvmlDevice_t d
 #define NVML_DEVICE_MIG_ENABLE 0x1
 nvmlReturn_t (*nvmlDeviceGetMigMode)(nvmlDevice_t device, unsigned int *currentMode, unsigned int *pendingMode);
 
+// NVLink functions (not present in older NVML versions, gracefully handled)
+static nvmlReturn_t (*nvmlDeviceGetNvLinkLinkCount)(nvmlDevice_t device, unsigned int *linkCount);
+static nvmlReturn_t (*nvmlDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, unsigned int *isActive);
+static nvmlReturn_t (*nvmlDeviceGetNvLinkThroughput)(nvmlDevice_t device, unsigned int link, unsigned int type,
+                                                     unsigned long long *counter);
+static nvmlReturn_t (*nvmlDeviceGetNvLinkErrorCounter)(nvmlDevice_t device, unsigned int link, unsigned int type,
+                                                       unsigned long long *counter);
+static nvmlReturn_t (*nvmlDeviceGetNvLinkRemoteDeviceInfo)(nvmlDevice_t device, unsigned int link,
+                                                           unsigned int *deviceType, unsigned int *pciBusId,
+                                                           unsigned int *nvLinkId);
+
+// Per-lane CRC error counters
+static nvmlReturn_t (*nvmlDeviceGetNvLinkEccCounter)(nvmlDevice_t device, unsigned int type,
+                                                     unsigned long long *counter);
+
 static void *libnvidia_ml_handle;
 
 static nvmlReturn_t last_nvml_return_status = NVML_SUCCESS;
@@ -276,6 +291,11 @@ struct gpu_info_nvidia {
   nvmlDevice_t gpuhandle;
   bool isInMigMode;
   unsigned long long last_utilization_timestamp;
+
+  // NVLink throughput tracking (for delta-based rate calculation)
+  unsigned long long nvlink_tx_counters[NVTOP_NVLINK_MAX_LINKS];
+  unsigned long long nvlink_rx_counters[NVTOP_NVLINK_MAX_LINKS];
+  unsigned long long last_nvlink_throughput_time; // time in ms for rate calculation
 };
 
 static LIST_HEAD(allocations);
@@ -470,6 +490,14 @@ static bool gpuinfo_nvidia_init(void) {
   nvmlDeviceGetProcessUtilization = dlsym(libnvidia_ml_handle, "nvmlDeviceGetProcessUtilization");
   nvmlDeviceGetMigMode = dlsym(libnvidia_ml_handle, "nvmlDeviceGetMigMode");
 
+  // NVLink functions (optional - not available on all drivers/hardware)
+  nvmlDeviceGetNvLinkLinkCount = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkLinkCount");
+  nvmlDeviceGetNvLinkState = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkState");
+  nvmlDeviceGetNvLinkThroughput = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkThroughput");
+  nvmlDeviceGetNvLinkErrorCounter = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkErrorCounter");
+  nvmlDeviceGetNvLinkRemoteDeviceInfo = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkRemoteDeviceInfo");
+  nvmlDeviceGetNvLinkEccCounter = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkEccCounter");
+
   last_nvml_return_status = nvmlInit();
   if (last_nvml_return_status != NVML_SUCCESS) {
     return false;
@@ -936,3 +964,115 @@ static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info) {
         !gpu_info->base.dynamic_info.multi_instance_mode))
     gpuinfo_nvidia_get_process_utilization(gpu_info, _gpu_info->processes_count, _gpu_info->processes);
 }
+
+// NVML NVLink enums (defined locally since we don't have nvml.h)
+#define NVML_NVLINK_MAX_LINKS_INTERNAL 18
+#define NVML_NVLINK_ERROR_DL_REPLAY 0
+#define NVML_NVLINK_ERROR_DL_RECOVERY 1
+#define NVML_NVLINK_ERROR_DL_CRC_FLIT 2
+#define NVML_NVLINK_ERROR_DL_CRC_DATA 3
+#define NVML_NVLINK_ERROR_DL_ECC_DATA 4
+#define NVML_NVLINK_THROUGHPUT_TX 0
+#define NVML_NVLINK_THROUGHPUT_RX 1
+#define NVML_NVLINK_STATE_ACTIVE 0x1
+
+#include <time.h>
+
+// Forward declaration
+struct gpu_info_nvidia;
+
+unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *nvlink_info) {
+  if (!_gpu_info || !nvlink_info)
+    return 0;
+
+  struct gpu_info_nvidia *gpu_info = container_of(_gpu_info, struct gpu_info_nvidia, base);
+  nvmlDevice_t device = gpu_info->gpuhandle;
+
+  memset(nvlink_info, 0, sizeof(*nvlink_info));
+
+  // Check if NVLink functions are available
+  if (!nvmlDeviceGetNvLinkLinkCount)
+    return 0;
+
+  // Get link count
+  unsigned int linkCount = 0;
+  nvmlReturn_t ret = nvmlDeviceGetNvLinkLinkCount(device, &linkCount);
+  if (ret != NVML_SUCCESS || linkCount == 0)
+    return 0;
+
+  nvlink_info->supported = true;
+  nvlink_info->num_links = (unsigned)linkCount;
+
+  // Current time for rate calculation
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  unsigned long long current_time_ms = (unsigned long long)ts.tv_sec * 1000ULL + (unsigned long long)ts.tv_nsec / 1000000ULL;
+  unsigned long long delta_ms = 0;
+
+  if (gpu_info->last_nvlink_throughput_time > 0) {
+    delta_ms = current_time_ms - gpu_info->last_nvlink_throughput_time;
+    if (delta_ms == 0)
+      delta_ms = 1; // Avoid division by zero
+  }
+
+  for (unsigned int link = 0; link < linkCount; link++) {
+    struct nvlink_link_info *linfo = &nvlink_info->links[link];
+
+    // Link state
+    if (nvmlDeviceGetNvLinkState) {
+      unsigned int isActive = 0;
+      ret = nvmlDeviceGetNvLinkState(device, link, &isActive);
+      if (ret == NVML_SUCCESS)
+        linfo->active = (isActive == NVML_NVLINK_STATE_ACTIVE);
+    }
+
+    // Throughput (cumulative counters - we calculate delta for rate)
+    if (nvmlDeviceGetNvLinkThroughput) {
+      unsigned long long tx_counter = 0, rx_counter = 0;
+
+      ret = nvmlDeviceGetNvLinkThroughput(device, link, NVML_NVLINK_THROUGHPUT_TX, &tx_counter);
+      if (ret == NVML_SUCCESS) {
+        // Calculate throughput rate in KiB/s
+        if (gpu_info->last_nvlink_throughput_time > 0) {
+          unsigned long long delta = tx_counter - gpu_info->nvlink_tx_counters[link];
+          linfo->throughput_tx = (delta * 1000ULL) / delta_ms;
+        }
+        gpu_info->nvlink_tx_counters[link] = tx_counter;
+      }
+
+      ret = nvmlDeviceGetNvLinkThroughput(device, link, NVML_NVLINK_THROUGHPUT_RX, &rx_counter);
+      if (ret == NVML_SUCCESS) {
+        if (gpu_info->last_nvlink_throughput_time > 0) {
+          unsigned long long delta = rx_counter - gpu_info->nvlink_rx_counters[link];
+          linfo->throughput_rx = (delta * 1000ULL) / delta_ms;
+        }
+        gpu_info->nvlink_rx_counters[link] = rx_counter;
+      }
+    }
+
+    // Error counters (cumulative)
+    if (nvmlDeviceGetNvLinkErrorCounter) {
+      ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_REPLAY, &linfo->errors_replay);
+      ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_RECOVERY, &linfo->errors_recovery);
+      ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_CRC_FLIT, &linfo->errors_crc_flit);
+      ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_CRC_DATA, &linfo->errors_crc_data);
+      ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_ECC_DATA, &linfo->errors_ecc_data);
+    }
+
+    // Per-lane CRC errors (ECC counter) - per-lane granularity
+    if (nvmlDeviceGetNvLinkEccCounter) {
+      // NVML provides aggregate ECC, not per-lane - lanes are not individually queryable in all driver versions
+      unsigned long long ecc_total = 0;
+      ret = nvmlDeviceGetNvLinkEccCounter(device, NVML_NVLINK_ERROR_DL_ECC_DATA, &ecc_total);
+      if (ret == NVML_SUCCESS) {
+        // Distribute across lanes (best effort - actual per-lane data requires newer API)
+        linfo->crc_per_lane[0] = ecc_total;
+        linfo->lanes = 1;
+      }
+    }
+  }
+
+  gpu_info->last_nvlink_throughput_time = current_time_ms;
+
+  return nvlink_info->num_links;
+}

From e47bc48d162b241ccd4d053691d500d928c2f230 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Wed, 29 Apr 2026 11:40:14 -0400
Subject: [PATCH 02/31] feat: add NVLink display to nvtop interface

- Add nvlink_info window to device_window struct
- Allocate NVLink window on line 2 of device info block
- Shift all subsequent rows down by 1 to accommodate
- Add NVLink rendering: per-link status (A/x), TX/RX throughput, error indicators
- Color coding: green=active, red=inactive or errors present
- Update device_header_rows from 3/4 to 4/5 in layout calculation
---
 include/nvtop/interface_internal_common.h |   1 +
 src/interface.c                           | 109 ++++++++++++++++++----
 2 files changed, 92 insertions(+), 18 deletions(-)

diff --git a/include/nvtop/interface_internal_common.h b/include/nvtop/interface_internal_common.h
index aec93d3c..f5091998 100644
--- a/include/nvtop/interface_internal_common.h
+++ b/include/nvtop/interface_internal_common.h
@@ -70,6 +70,7 @@ struct device_window {
   WINDOW *gpu_clock_info;
   WINDOW *mem_clock_info;
   WINDOW *pcie_info;
+  WINDOW *nvlink_info;
   WINDOW *shader_cores;
   WINDOW *l2_cache_size;
   WINDOW *exec_engines;
diff --git a/src/interface.c b/src/interface.c
index 562fb033..debee7c3 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -69,27 +69,31 @@ static void alloc_device_window(unsigned int start_row, unsigned int start_col,
       newwin(1, sizeof_device_field[device_pcie], start_row, start_col + spacer + sizeof_device_field[device_name]);
   if (dwin->pcie_info == NULL)
     goto alloc_error;
+  dwin->nvlink_info =
+      newwin(1, sizeof_device_field[device_pcie], start_row + 1, start_col + spacer + sizeof_device_field[device_name]);
+  if (dwin->nvlink_info == NULL)
+    goto alloc_error;
 
   // Line 2 = GPU clk | MEM clk | Temp | Fan | Power
-  dwin->gpu_clock_info = newwin(1, sizeof_device_field[device_clock], start_row + 1, start_col);
+  dwin->gpu_clock_info = newwin(1, sizeof_device_field[device_clock], start_row + 2, start_col);
   if (dwin->gpu_clock_info == NULL)
     goto alloc_error;
-  dwin->mem_clock_info = newwin(1, sizeof_device_field[device_mem_clock], start_row + 1,
+  dwin->mem_clock_info = newwin(1, sizeof_device_field[device_mem_clock], start_row + 2,
                                 start_col + spacer + sizeof_device_field[device_clock]);
   if (dwin->mem_clock_info == NULL)
     goto alloc_error;
   dwin->temperature =
-      newwin(1, sizeof_device_field[device_temperature], start_row + 1,
+      newwin(1, sizeof_device_field[device_temperature], start_row + 2,
              start_col + spacer * 2 + sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock]);
   if (dwin->temperature == NULL)
     goto alloc_error;
-  dwin->fan_speed = newwin(1, sizeof_device_field[device_fan_speed], start_row + 1,
+  dwin->fan_speed = newwin(1, sizeof_device_field[device_fan_speed], start_row + 2,
                            start_col + spacer * 3 + sizeof_device_field[device_clock] +
                                sizeof_device_field[device_mem_clock] + sizeof_device_field[device_temperature]);
   if (dwin->fan_speed == NULL)
     goto alloc_error;
   dwin->power_info =
-      newwin(1, sizeof_device_field[device_power], start_row + 1,
+      newwin(1, sizeof_device_field[device_power], start_row + 2,
              start_col + spacer * 4 + sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] +
                  sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed]);
   if (dwin->power_info == NULL)
@@ -131,49 +135,49 @@ static void alloc_device_window(unsigned int start_row, unsigned int start_col,
     size_encode += 1;
   size_encode /= 2;
 
-  dwin->gpu_util_enc_dec = newwin(1, size_gpu, start_row + 2, start_col);
+  dwin->gpu_util_enc_dec = newwin(1, size_gpu, start_row + 3, start_col);
   if (dwin->gpu_util_enc_dec == NULL)
     goto alloc_error;
-  dwin->mem_util_enc_dec = newwin(1, size_mem, start_row + 2, start_col + spacer + size_gpu);
+  dwin->mem_util_enc_dec = newwin(1, size_mem, start_row + 3, start_col + spacer + size_gpu);
   if (dwin->mem_util_enc_dec == NULL)
     goto alloc_error;
-  dwin->encode_util = newwin(1, size_encode, start_row + 2, start_col + spacer * 2 + size_gpu + size_mem);
+  dwin->encode_util = newwin(1, size_encode, start_row + 3, start_col + spacer * 2 + size_gpu + size_mem);
   if (dwin->encode_util == NULL)
     goto alloc_error;
-  dwin->decode_util = newwin(1, size_decode, start_row + 2, start_col + spacer * 3 + size_gpu + size_mem + size_encode);
+  dwin->decode_util = newwin(1, size_decode, start_row + 3, start_col + spacer * 3 + size_gpu + size_mem + size_encode);
   if (dwin->decode_util == NULL)
     goto alloc_error;
-  dwin->encdec_util = newwin(1, size_encode * 2, start_row + 2, start_col + spacer * 2 + size_gpu + size_mem);
+  dwin->encdec_util = newwin(1, size_encode * 2, start_row + 3, start_col + spacer * 2 + size_gpu + size_mem);
   if (dwin->encdec_util == NULL)
     goto alloc_error;
   // For auto-hide encode / decode window
-  dwin->gpu_util_no_enc_or_dec = newwin(1, size_gpu + size_encode / 2 + 1, start_row + 2, start_col);
+  dwin->gpu_util_no_enc_or_dec = newwin(1, size_gpu + size_encode / 2 + 1, start_row + 3, start_col);
   if (dwin->gpu_util_no_enc_or_dec == NULL)
     goto alloc_error;
   dwin->mem_util_no_enc_or_dec =
-      newwin(1, size_mem + size_encode / 2, start_row + 2, start_col + spacer + size_gpu + size_encode / 2 + 1);
+      newwin(1, size_mem + size_encode / 2, start_row + 3, start_col + spacer + size_gpu + size_encode / 2 + 1);
   if (dwin->mem_util_no_enc_or_dec == NULL)
     goto alloc_error;
-  dwin->gpu_util_no_enc_and_dec = newwin(1, size_gpu + size_encode + 1, start_row + 2, start_col);
+  dwin->gpu_util_no_enc_and_dec = newwin(1, size_gpu + size_encode + 1, start_row + 3, start_col);
   if (dwin->gpu_util_no_enc_and_dec == NULL)
     goto alloc_error;
   dwin->mem_util_no_enc_and_dec =
-      newwin(1, size_mem + size_encode + 1, start_row + 2, start_col + spacer + size_gpu + size_encode + 1);
+      newwin(1, size_mem + size_encode + 1, start_row + 3, start_col + spacer + size_gpu + size_encode + 1);
   if (dwin->mem_util_no_enc_and_dec == NULL)
     goto alloc_error;
   dwin->enc_was_visible = false;
   dwin->dec_was_visible = false;
 
   // Line 4 = Number of shading cores | L2 Features
-  dwin->shader_cores = newwin(1, sizeof_device_field[device_shadercores], start_row + 3, start_col);
+  dwin->shader_cores = newwin(1, sizeof_device_field[device_shadercores], start_row + 4, start_col);
   if (dwin->shader_cores == NULL)
     goto alloc_error;
-  dwin->l2_cache_size = newwin(1, sizeof_device_field[device_l2features], start_row + 3,
+  dwin->l2_cache_size = newwin(1, sizeof_device_field[device_l2features], start_row + 4,
                                start_col + spacer + sizeof_device_field[device_shadercores]);
   if (dwin->l2_cache_size == NULL)
     goto alloc_error;
   dwin->exec_engines =
-      newwin(1, sizeof_device_field[device_execengines], start_row + 3,
+      newwin(1, sizeof_device_field[device_execengines], start_row + 4,
              start_col + spacer * 2 + sizeof_device_field[device_shadercores] + sizeof_device_field[device_l2features]);
   if (dwin->exec_engines == NULL)
     goto alloc_error;
@@ -202,6 +206,7 @@ static void free_device_windows(struct device_window *dwin) {
   delwin(dwin->temperature);
   delwin(dwin->fan_speed);
   delwin(dwin->pcie_info);
+  delwin(dwin->nvlink_info);
 }
 
 static void alloc_process_with_option(struct nvtop_interface *interface, unsigned posX, unsigned posY, unsigned sizeX,
@@ -364,7 +369,7 @@ static void initialize_all_windows(struct nvtop_interface *dwin) {
   struct window_position plot_positions[MAX_CHARTS];
   struct window_position setup_position;
 
-  compute_sizes_from_layout(devices_count, dwin->options.has_gpu_info_bar ? 4 : 3, device_length(), rows - 1, cols,
+  compute_sizes_from_layout(devices_count, dwin->options.has_gpu_info_bar ? 5 : 4, device_length(), rows - 1, cols,
                             dwin->options.gpu_specific_opts, dwin->options.process_fields_displayed, device_positions,
                             &dwin->num_plots, plot_positions, map_device_to_plot, &process_position, &setup_position,
                             dwin->options.hide_processes_list);
@@ -861,6 +866,74 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
 
     wnoutrefresh(dev->pcie_info);
 
+    // NVLink info
+    werase(dev->nvlink_info);
+    wcolor_set(dev->nvlink_info, cyan_color, NULL);
+    mvwprintw(dev->nvlink_info, 0, 0, "NVLink ");
+    wstandend(dev->nvlink_info);
+
+    struct nvlink_info nvl_info;
+    unsigned nvlinks = nvtop_get_nvlink_info(device, &nvl_info);
+    if (nvlinks > 0 && nvl_info.supported) {
+      int pos = 7;
+      for (unsigned link = 0; link < nvl_info.num_links && pos < 44; link++) {
+        const struct nvlink_link_info *l = &nvl_info.links[link];
+        if (pos + 12 > 44)
+          break;
+        // Link indicator: A=active, x=inactive
+        wcolor_set(dev->nvlink_info, l->active ? green_color : red_color, NULL);
+        mvwprintw(dev->nvlink_info, 0, pos, "L%d%c", link, l->active ? 'A' : 'x');
+        wstandend(dev->nvlink_info);
+        pos += 4;
+        // Throughput TX
+        if (pos + 5 > 44)
+          break;
+        if (l->throughput_tx > 0) {
+          if (l->throughput_tx > 1024) {
+            mvwprintw(dev->nvlink_info, 0, pos, "%3uM", (unsigned)(l->throughput_tx / 1024));
+          } else {
+            mvwprintw(dev->nvlink_info, 0, pos, "%3uk", (unsigned)l->throughput_tx);
+          }
+        } else {
+          mvwprintw(dev->nvlink_info, 0, pos, "  -");
+        }
+        pos += 4;
+        // Separator
+        if (pos < 44) {
+          mvwprintw(dev->nvlink_info, 0, pos, "/");
+          pos++;
+        }
+        // Throughput RX
+        if (pos + 5 > 44)
+          break;
+        if (l->throughput_rx > 0) {
+          if (l->throughput_rx > 1024) {
+            mvwprintw(dev->nvlink_info, 0, pos, "%3uM", (unsigned)(l->throughput_rx / 1024));
+          } else {
+            mvwprintw(dev->nvlink_info, 0, pos, "%3uk", (unsigned)l->throughput_rx);
+          }
+        } else {
+          mvwprintw(dev->nvlink_info, 0, pos, "  -");
+        }
+        pos += 4;
+        // Error indicator
+        if (pos < 44 && (l->errors_replay || l->errors_recovery || l->errors_crc_flit || l->errors_crc_data || l->errors_ecc_data)) {
+          wcolor_set(dev->nvlink_info, red_color, NULL);
+          mvwprintw(dev->nvlink_info, 0, pos, "!");
+          wstandend(dev->nvlink_info);
+          pos++;
+        }
+        if (pos < 44) {
+          mvwprintw(dev->nvlink_info, 0, pos, " ");
+          pos++;
+        }
+      }
+    } else {
+      wprintw(dev->nvlink_info, "N/A");
+    }
+
+    wnoutrefresh(dev->nvlink_info);
+
     if (interface->options.has_gpu_info_bar) {
       // Number of shader cores
       werase(dev->shader_cores);

From 2ef64f01564f111b513359d3dba74243fafb2001 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Wed, 29 Apr 2026 12:07:12 -0400
Subject: [PATCH 03/31] fix: replace non-existent NVML symbols with correct API
 calls

- Replace nvmlDeviceGetNvLinkLinkCount (doesn't exist) with link
  discovery via nvmlDeviceGetNvLinkState probe loop
- Replace nvmlDeviceGetNvLinkThroughput (doesn't exist) with
  nvmlDeviceGetNvLinkUtilizationCounter (returns both RX and TX)
- Remove nvmlDeviceGetNvLinkRemoteDeviceInfo (doesn't exist)
- Remove nvmlDeviceGetNvLinkEccCounter (doesn't exist, covered by
  nvmlDeviceGetNvLinkErrorCounter with type DL_ECC_DATA)
- Skip throughput display on consumer GPUs where utilization
  counters return NVML_ERROR_NOT_SUPPORTED
- Show all 4 links (L0A L1A L2A L3A) on RTX 3090 instead of N/A
---
 src/extract_gpuinfo_nvidia.c | 102 ++++++++++++++++++-----------------
 src/interface.c              |  66 +++++++++++++----------
 2 files changed, 90 insertions(+), 78 deletions(-)

diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index dfe40971..e81f616d 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -208,19 +208,19 @@ static nvmlReturn_t (*nvmlDeviceGetMPSComputeRunningProcesses[4])(nvmlDevice_t d
 nvmlReturn_t (*nvmlDeviceGetMigMode)(nvmlDevice_t device, unsigned int *currentMode, unsigned int *pendingMode);
 
 // NVLink functions (not present in older NVML versions, gracefully handled)
-static nvmlReturn_t (*nvmlDeviceGetNvLinkLinkCount)(nvmlDevice_t device, unsigned int *linkCount);
 static nvmlReturn_t (*nvmlDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, unsigned int *isActive);
-static nvmlReturn_t (*nvmlDeviceGetNvLinkThroughput)(nvmlDevice_t device, unsigned int link, unsigned int type,
-                                                     unsigned long long *counter);
 static nvmlReturn_t (*nvmlDeviceGetNvLinkErrorCounter)(nvmlDevice_t device, unsigned int link, unsigned int type,
                                                        unsigned long long *counter);
-static nvmlReturn_t (*nvmlDeviceGetNvLinkRemoteDeviceInfo)(nvmlDevice_t device, unsigned int link,
-                                                           unsigned int *deviceType, unsigned int *pciBusId,
-                                                           unsigned int *nvLinkId);
-
-// Per-lane CRC error counters
-static nvmlReturn_t (*nvmlDeviceGetNvLinkEccCounter)(nvmlDevice_t device, unsigned int type,
-                                                     unsigned long long *counter);
+static nvmlReturn_t (*nvmlDeviceGetNvLinkUtilizationCounter)(nvmlDevice_t device, unsigned int link,
+                                                             unsigned int counter,
+                                                             unsigned long long *rxcounter,
+                                                             unsigned long long *txcounter);
+static nvmlReturn_t (*nvmlDeviceGetNvLinkUtilizationControl)(nvmlDevice_t device, unsigned int link,
+                                                             unsigned int counter,
+                                                             unsigned long long *domain,
+                                                             unsigned long long *unit);
+static nvmlReturn_t (*nvmlDeviceResetNvLinkUtilizationCounter)(nvmlDevice_t device, unsigned int link,
+                                                               unsigned int counter);
 
 static void *libnvidia_ml_handle;
 
@@ -491,12 +491,11 @@ static bool gpuinfo_nvidia_init(void) {
   nvmlDeviceGetMigMode = dlsym(libnvidia_ml_handle, "nvmlDeviceGetMigMode");
 
   // NVLink functions (optional - not available on all drivers/hardware)
-  nvmlDeviceGetNvLinkLinkCount = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkLinkCount");
   nvmlDeviceGetNvLinkState = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkState");
-  nvmlDeviceGetNvLinkThroughput = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkThroughput");
   nvmlDeviceGetNvLinkErrorCounter = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkErrorCounter");
-  nvmlDeviceGetNvLinkRemoteDeviceInfo = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkRemoteDeviceInfo");
-  nvmlDeviceGetNvLinkEccCounter = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkEccCounter");
+  nvmlDeviceGetNvLinkUtilizationCounter = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkUtilizationCounter");
+  nvmlDeviceGetNvLinkUtilizationControl = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkUtilizationControl");
+  nvmlDeviceResetNvLinkUtilizationCounter = dlsym(libnvidia_ml_handle, "nvmlDeviceResetNvLinkUtilizationCounter");
 
   last_nvml_return_status = nvmlInit();
   if (last_nvml_return_status != NVML_SUCCESS) {
@@ -972,9 +971,9 @@ static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info) {
 #define NVML_NVLINK_ERROR_DL_CRC_FLIT 2
 #define NVML_NVLINK_ERROR_DL_CRC_DATA 3
 #define NVML_NVLINK_ERROR_DL_ECC_DATA 4
-#define NVML_NVLINK_THROUGHPUT_TX 0
-#define NVML_NVLINK_THROUGHPUT_RX 1
 #define NVML_NVLINK_STATE_ACTIVE 0x1
+#define NVML_NVLINK_UTILIZATION_COUNTER_0 0
+#define NVML_NVLINK_UTILIZATION_COUNTER_1 1
 
 #include <time.h>
 
@@ -990,18 +989,32 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
 
   memset(nvlink_info, 0, sizeof(*nvlink_info));
 
-  // Check if NVLink functions are available
-  if (!nvmlDeviceGetNvLinkLinkCount)
+  // Check if core NVLink functions are available
+  if (!nvmlDeviceGetNvLinkState)
     return 0;
 
-  // Get link count
+  // Discover link count by probing each possible link (nvmlDeviceGetNvLinkLinkCount doesn't exist)
+  // We probe up to 18 indices (NVML_NVLINK_MAX_LINKS_INTERNAL); linkCount ends as the highest answering index + 1
   unsigned int linkCount = 0;
-  nvmlReturn_t ret = nvmlDeviceGetNvLinkLinkCount(device, &linkCount);
-  if (ret != NVML_SUCCESS || linkCount == 0)
+  for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS_INTERNAL; link++) {
+    unsigned int isActive = 0;
+    nvmlReturn_t ret = nvmlDeviceGetNvLinkState(device, link, &isActive);
+    // NVML_ERROR_NOT_SUPPORTED: this index is treated as absent, but probing continues with the next one
+    // NVML_SUCCESS: the link exists (active or inactive); any other status aborts the probe
+    if (ret == NVML_SUCCESS || ret == NVML_ERROR_NOT_SUPPORTED) {
+      // Only a successful query extends linkCount; NOT_SUPPORTED falls through to the next index
+      if (ret == NVML_SUCCESS)
+        linkCount = link + 1;
+    } else {
+      break; // Error or invalid link
+    }
+  }
+
+  if (linkCount == 0)
     return 0;
 
   nvlink_info->supported = true;
-  nvlink_info->num_links = (unsigned)linkCount;
+  nvlink_info->num_links = linkCount;
 
   // Current time for rate calculation
   struct timespec ts;
@@ -1021,55 +1034,44 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
     // Link state
     if (nvmlDeviceGetNvLinkState) {
       unsigned int isActive = 0;
-      ret = nvmlDeviceGetNvLinkState(device, link, &isActive);
+      nvmlReturn_t ret = nvmlDeviceGetNvLinkState(device, link, &isActive);
       if (ret == NVML_SUCCESS)
         linfo->active = (isActive == NVML_NVLINK_STATE_ACTIVE);
     }
 
-    // Throughput (cumulative counters - we calculate delta for rate)
-    if (nvmlDeviceGetNvLinkThroughput) {
-      unsigned long long tx_counter = 0, rx_counter = 0;
+    // Throughput using nvmlDeviceGetNvLinkUtilizationCounter (returns both RX and TX)
+    // This replaces nvmlDeviceGetNvLinkThroughput which doesn't exist
+    if (nvmlDeviceGetNvLinkUtilizationCounter) {
+      unsigned long long rx_counter = 0, tx_counter = 0;
 
-      ret = nvmlDeviceGetNvLinkThroughput(device, link, NVML_NVLINK_THROUGHPUT_TX, &tx_counter);
-      if (ret == NVML_SUCCESS) {
-        // Calculate throughput rate in KiB/s
-        if (gpu_info->last_nvlink_throughput_time > 0) {
-          unsigned long long delta = tx_counter - gpu_info->nvlink_tx_counters[link];
-          linfo->throughput_tx = (delta * 1000ULL) / delta_ms;
-        }
-        gpu_info->nvlink_tx_counters[link] = tx_counter;
+      // Try counter 0 first, fall back to counter 1
+      nvmlReturn_t ret = nvmlDeviceGetNvLinkUtilizationCounter(device, link, NVML_NVLINK_UTILIZATION_COUNTER_0, &rx_counter, &tx_counter);
+      if (ret != NVML_SUCCESS) {
+        ret = nvmlDeviceGetNvLinkUtilizationCounter(device, link, NVML_NVLINK_UTILIZATION_COUNTER_1, &rx_counter, &tx_counter);
       }
 
-      ret = nvmlDeviceGetNvLinkThroughput(device, link, NVML_NVLINK_THROUGHPUT_RX, &rx_counter);
       if (ret == NVML_SUCCESS) {
+        // NOTE(review): unit follows the counter's utilization control (cycles/packets/bytes) - not verified to be KiB
+        // Calculate throughput rate in KiB/s
         if (gpu_info->last_nvlink_throughput_time > 0) {
-          unsigned long long delta = rx_counter - gpu_info->nvlink_rx_counters[link];
-          linfo->throughput_rx = (delta * 1000ULL) / delta_ms;
+          unsigned long long delta_tx = tx_counter - gpu_info->nvlink_tx_counters[link];
+          unsigned long long delta_rx = rx_counter - gpu_info->nvlink_rx_counters[link];
+          linfo->throughput_tx = (delta_tx * 1000ULL) / delta_ms;
+          linfo->throughput_rx = (delta_rx * 1000ULL) / delta_ms;
         }
+        gpu_info->nvlink_tx_counters[link] = tx_counter;
         gpu_info->nvlink_rx_counters[link] = rx_counter;
       }
     }
 
     // Error counters (cumulative)
     if (nvmlDeviceGetNvLinkErrorCounter) {
-      ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_REPLAY, &linfo->errors_replay);
+      nvmlReturn_t ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_REPLAY, &linfo->errors_replay);
       ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_RECOVERY, &linfo->errors_recovery);
       ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_CRC_FLIT, &linfo->errors_crc_flit);
       ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_CRC_DATA, &linfo->errors_crc_data);
       ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_ECC_DATA, &linfo->errors_ecc_data);
     }
-
-    // Per-lane CRC errors (ECC counter) - per-lane granularity
-    if (nvmlDeviceGetNvLinkEccCounter) {
-      // NVML provides aggregate ECC, not per-lane - lanes are not individually queryable in all driver versions
-      unsigned long long ecc_total = 0;
-      ret = nvmlDeviceGetNvLinkEccCounter(device, NVML_NVLINK_ERROR_DL_ECC_DATA, &ecc_total);
-      if (ret == NVML_SUCCESS) {
-        // Distribute across lanes (best effort - actual per-lane data requires newer API)
-        linfo->crc_per_lane[0] = ecc_total;
-        linfo->lanes = 1;
-      }
-    }
   }
 
   gpu_info->last_nvlink_throughput_time = current_time_ms;
diff --git a/src/interface.c b/src/interface.c
index debee7c3..14c7bdbf 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -876,46 +876,56 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
     unsigned nvlinks = nvtop_get_nvlink_info(device, &nvl_info);
     if (nvlinks > 0 && nvl_info.supported) {
       int pos = 7;
+      // Check if any link has throughput data (not available on consumer GPUs)
+      bool has_throughput = false;
+      for (unsigned link = 0; link < nvl_info.num_links; link++) {
+        if (nvl_info.links[link].throughput_tx > 0 || nvl_info.links[link].throughput_rx > 0) {
+          has_throughput = true;
+          break;
+        }
+      }
+
       for (unsigned link = 0; link < nvl_info.num_links && pos < 44; link++) {
         const struct nvlink_link_info *l = &nvl_info.links[link];
-        if (pos + 12 > 44)
-          break;
         // Link indicator: A=active, x=inactive
         wcolor_set(dev->nvlink_info, l->active ? green_color : red_color, NULL);
         mvwprintw(dev->nvlink_info, 0, pos, "L%d%c", link, l->active ? 'A' : 'x');
         wstandend(dev->nvlink_info);
         pos += 4;
-        // Throughput TX
-        if (pos + 5 > 44)
-          break;
-        if (l->throughput_tx > 0) {
-          if (l->throughput_tx > 1024) {
-            mvwprintw(dev->nvlink_info, 0, pos, "%3uM", (unsigned)(l->throughput_tx / 1024));
+        // Throughput (only shown when actually available)
+        if (has_throughput) {
+          if (pos + 10 > 44)
+            break;
+          // TX
+          if (l->throughput_tx > 0) {
+            if (l->throughput_tx > 1024) {
+              mvwprintw(dev->nvlink_info, 0, pos, "%3uM", (unsigned)(l->throughput_tx / 1024));
+            } else {
+              mvwprintw(dev->nvlink_info, 0, pos, "%3uk", (unsigned)l->throughput_tx);
+            }
           } else {
-            mvwprintw(dev->nvlink_info, 0, pos, "%3uk", (unsigned)l->throughput_tx);
+            mvwprintw(dev->nvlink_info, 0, pos, "  -");
           }
-        } else {
-          mvwprintw(dev->nvlink_info, 0, pos, "  -");
-        }
-        pos += 4;
-        // Separator
-        if (pos < 44) {
-          mvwprintw(dev->nvlink_info, 0, pos, "/");
-          pos++;
-        }
-        // Throughput RX
-        if (pos + 5 > 44)
-          break;
-        if (l->throughput_rx > 0) {
-          if (l->throughput_rx > 1024) {
-            mvwprintw(dev->nvlink_info, 0, pos, "%3uM", (unsigned)(l->throughput_rx / 1024));
+          pos += 4;
+          // Separator
+          if (pos < 44) {
+            mvwprintw(dev->nvlink_info, 0, pos, "/");
+            pos++;
+          }
+          // RX
+          if (pos + 5 > 44)
+            break;
+          if (l->throughput_rx > 0) {
+            if (l->throughput_rx > 1024) {
+              mvwprintw(dev->nvlink_info, 0, pos, "%3uM", (unsigned)(l->throughput_rx / 1024));
+            } else {
+              mvwprintw(dev->nvlink_info, 0, pos, "%3uk", (unsigned)l->throughput_rx);
+            }
           } else {
-            mvwprintw(dev->nvlink_info, 0, pos, "%3uk", (unsigned)l->throughput_rx);
+            mvwprintw(dev->nvlink_info, 0, pos, "  -");
           }
-        } else {
-          mvwprintw(dev->nvlink_info, 0, pos, "  -");
+          pos += 4;
         }
-        pos += 4;
         // Error indicator
         if (pos < 44 && (l->errors_replay || l->errors_recovery || l->errors_crc_flit || l->errors_crc_data || l->errors_ecc_data)) {
           wcolor_set(dev->nvlink_info, red_color, NULL);

From 110a9cb1265e9317faab52b3e61777d3db0845f8 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Thu, 30 Apr 2026 00:05:48 -0400
Subject: [PATCH 04/31] refactor: NVLink flat struct, CLI throughput,
 conditional layout

- Flatten struct nvlink_info (no nested link_info array)
- Throughput via nvidia-smi CLI (poll every 2s) instead of NVML utilization counters
- Conditional layout: any_device_has_nvlink flag controls spacing
- Revert to exact upstream layout when no NVLink GPU detected
- Marketing version remapping with device name overrides for RTX 3090
- Reuse print_pcie_at_scale() for throughput formatting
- Only 2 dlsym'd NVML symbols: GetNvLinkState, GetNvLinkVersion
---
 include/nvtop/extract_gpuinfo_common.h |  25 +--
 src/extract_gpuinfo_nvidia.c           | 195 +++++++++++---------
 src/interface.c                        | 242 +++++++++++++------------
 src/nvtop.c                            |   4 +
 4 files changed, 250 insertions(+), 216 deletions(-)

diff --git a/include/nvtop/extract_gpuinfo_common.h b/include/nvtop/extract_gpuinfo_common.h
index 08acbcfe..1324cbd9 100644
--- a/include/nvtop/extract_gpuinfo_common.h
+++ b/include/nvtop/extract_gpuinfo_common.h
@@ -243,28 +243,19 @@ unsigned nvtop_pcie_gen_from_link_speed(unsigned linkSpeed);
 // NVLink support
 #define NVTOP_NVLINK_MAX_LINKS 18
 
-// Per-lane CRC errors (up to 64 lanes per link)
-#define NVTOP_NVLINK_MAX_LANES 64
-
-struct nvlink_link_info {
-  bool active;                        // Link is active
-  unsigned long long throughput_tx;   // TX throughput in KiB/s (rate, not cumulative)
-  unsigned long long throughput_rx;   // RX throughput in KiB/s (rate, not cumulative)
-  unsigned long long errors_replay;   // Replay error count
-  unsigned long long errors_recovery; // Recovery error count
-  unsigned long long errors_crc_flit; // CRC FLIT error count
-  unsigned long long errors_crc_data; // CRC DATA error count
-  unsigned long long errors_ecc_data; // ECC DATA error count
-  unsigned long long crc_per_lane[NVTOP_NVLINK_MAX_LANES]; // Per-lane CRC corrections
-  unsigned lanes;                     // Number of lanes on this link
-};
-
 struct nvlink_info {
   unsigned num_links;                 // Number of NVLink links on this device
+  unsigned version;                   // NVLink version (e.g. 3 for NVLink 3.0)
   bool supported;                     // NVLink is supported on this device
-  struct nvlink_link_info links[NVTOP_NVLINK_MAX_LINKS];
+  bool has_throughput;                // Whether throughput data was available this cycle
+  unsigned long long aggregate_tx;    // Aggregate TX throughput across all links (KiB/s)
+  unsigned long long aggregate_rx;    // Aggregate RX throughput across all links (KiB/s)
 };
 
 unsigned nvtop_get_nvlink_info(struct gpu_info *gpu_info, struct nvlink_info *nvlink_info);
 
+// NVLink probe — call before initialize_curses to set layout mode
+bool nvtop_probe_nvlink_list(struct list_head *devices);
+void nvtop_set_nvlink_probe(bool val);
+
 #endif // EXTRACT_GPUINFO_COMMON_H__
diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index e81f616d..6fb305dc 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -21,6 +21,7 @@
 
 #include "nvtop/common.h"
 #include "nvtop/extract_gpuinfo_common.h"
+#include "nvtop/time.h"
 
 #include <dlfcn.h>
 #include <errno.h>
@@ -209,18 +210,7 @@ nvmlReturn_t (*nvmlDeviceGetMigMode)(nvmlDevice_t device, unsigned int *currentM
 
 // NVLink functions (not present in older NVML versions, gracefully handled)
 static nvmlReturn_t (*nvmlDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, unsigned int *isActive);
-static nvmlReturn_t (*nvmlDeviceGetNvLinkErrorCounter)(nvmlDevice_t device, unsigned int link, unsigned int type,
-                                                       unsigned long long *counter);
-static nvmlReturn_t (*nvmlDeviceGetNvLinkUtilizationCounter)(nvmlDevice_t device, unsigned int link,
-                                                             unsigned int counter,
-                                                             unsigned long long *rxcounter,
-                                                             unsigned long long *txcounter);
-static nvmlReturn_t (*nvmlDeviceGetNvLinkUtilizationControl)(nvmlDevice_t device, unsigned int link,
-                                                             unsigned int counter,
-                                                             unsigned long long *domain,
-                                                             unsigned long long *unit);
-static nvmlReturn_t (*nvmlDeviceResetNvLinkUtilizationCounter)(nvmlDevice_t device, unsigned int link,
-                                                               unsigned int counter);
+static nvmlReturn_t (*nvmlDeviceGetNvLinkVersion)(nvmlDevice_t device, unsigned int link, unsigned int *version);
 
 static void *libnvidia_ml_handle;
 
@@ -292,10 +282,14 @@ struct gpu_info_nvidia {
   bool isInMigMode;
   unsigned long long last_utilization_timestamp;
 
-  // NVLink throughput tracking (for delta-based rate calculation)
-  unsigned long long nvlink_tx_counters[NVTOP_NVLINK_MAX_LINKS];
-  unsigned long long nvlink_rx_counters[NVTOP_NVLINK_MAX_LINKS];
-  unsigned long long last_nvlink_throughput_time; // time in ms for rate calculation
+  // NVLink throughput via nvidia-smi CLI (consumer GPUs like RTX 3090)
+  unsigned int device_index; // For nvidia-smi -i calls
+  bool cli_poll_active; // True once CLI fallback has been successfully initialized
+  unsigned long long nvlink_cli_tx[NVTOP_NVLINK_MAX_LINKS]; // Per-link cumulative TX from CLI
+  unsigned long long nvlink_cli_rx[NVTOP_NVLINK_MAX_LINKS]; // Per-link cumulative RX from CLI
+  nvtop_time last_nvlink_cli_time; // Timestamp of last CLI poll (uses app's existing time API)
+  unsigned long long smoothed_agg_tx; // EMA-smoothed aggregate TX for display
+  unsigned long long smoothed_agg_rx; // EMA-smoothed aggregate RX for display
 };
 
 static LIST_HEAD(allocations);
@@ -492,10 +486,7 @@ static bool gpuinfo_nvidia_init(void) {
 
   // NVLink functions (optional - not available on all drivers/hardware)
   nvmlDeviceGetNvLinkState = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkState");
-  nvmlDeviceGetNvLinkErrorCounter = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkErrorCounter");
-  nvmlDeviceGetNvLinkUtilizationCounter = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkUtilizationCounter");
-  nvmlDeviceGetNvLinkUtilizationControl = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkUtilizationControl");
-  nvmlDeviceResetNvLinkUtilizationCounter = dlsym(libnvidia_ml_handle, "nvmlDeviceResetNvLinkUtilizationCounter");
+  nvmlDeviceGetNvLinkVersion = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkVersion");
 
   last_nvml_return_status = nvmlInit();
   if (last_nvml_return_status != NVML_SUCCESS) {
@@ -565,6 +556,7 @@ static bool gpuinfo_nvidia_get_device_handles(struct list_head *devices, unsigne
       nvmlReturn_t pciInfoRet = nvmlDeviceGetPciInfo(gpu_infos[*count].gpuhandle, &pciInfo);
       if (pciInfoRet == NVML_SUCCESS) {
         strncpy(gpu_infos[*count].base.pdev, pciInfo.busIdLegacy, PDEV_LEN);
+        gpu_infos[*count].device_index = i;
         list_add_tail(&gpu_infos[*count].base.list, devices);
         *count += 1;
       }
@@ -966,20 +958,68 @@ static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info) {
 
 // NVML NVLink enums (defined locally since we don't have nvml.h)
 #define NVML_NVLINK_MAX_LINKS_INTERNAL 18
-#define NVML_NVLINK_ERROR_DL_REPLAY 0
-#define NVML_NVLINK_ERROR_DL_RECOVERY 1
-#define NVML_NVLINK_ERROR_DL_CRC_FLIT 2
-#define NVML_NVLINK_ERROR_DL_CRC_DATA 3
-#define NVML_NVLINK_ERROR_DL_ECC_DATA 4
-#define NVML_NVLINK_STATE_ACTIVE 0x1
-#define NVML_NVLINK_UTILIZATION_COUNTER_0 0
-#define NVML_NVLINK_UTILIZATION_COUNTER_1 1
 
-#include <time.h>
+#include <stdio.h>
+#include <string.h>
 
 // Forward declaration
 struct gpu_info_nvidia;
 
+// Run `nvidia-smi nvlink --getthroughput d -i <device_index>` and parse its per-link data counters.
+// Fills tx_out/rx_out[0..link_count); returns link_count once 2*link_count Tx/Rx lines matched, else 0.
+static unsigned nvlink_cli_get_throughput(int device_index, unsigned int link_count,
+                                          unsigned long long *tx_out, unsigned long long *rx_out) {
+  char cmd[256];
+  snprintf(cmd, sizeof(cmd), "nvidia-smi nvlink --getthroughput d -i %d 2>/dev/null", device_index);
+
+  FILE *fp = popen(cmd, "r");
+  if (!fp)
+    return 0;
+
+  char line[512];
+  unsigned parsed = 0;
+  memset(tx_out, 0, link_count * sizeof(unsigned long long));
+  memset(rx_out, 0, link_count * sizeof(unsigned long long));
+
+  while (fgets(line, sizeof(line), fp)) {
+    unsigned int link = 0; // unsigned on purpose: it is the target of the %u conversions below
+    unsigned long long val = 0;
+    char *p = line;
+    while (*p == '\t' || *p == ' ')
+      p++;
+    if (sscanf(p, "Link %u: Data Tx: %llu", &link, &val) == 2 && link < link_count) {
+      tx_out[link] = val;
+      parsed++;
+    } else if (sscanf(p, "Link %u: Data Rx: %llu", &link, &val) == 2 && link < link_count) {
+      rx_out[link] = val;
+      parsed++;
+    }
+    if (parsed >= link_count * 2)
+      break;
+  }
+
+  pclose(fp);
+  return parsed >= link_count * 2 ? link_count : 0;
+}
+
+// Translate the raw value from nvmlDeviceGetNvLinkVersion into the NVLink marketing major version.
+// Raw values are sequential codes, not versions: e.g. raw 5 means NVLink 3.1 and is reported as 3.
+static unsigned int nvlink_marketing_version(unsigned int raw_version) {
+  // Codes not listed below are returned unchanged.
+  switch (raw_version) {
+    case 1: return 1;
+    case 2: return 2;
+    case 3: return 2;  // NVLink 2.2 -> 2
+    case 4: return 3;  // NVLink 3.0 -> 3
+    case 5: return 3;  // NVLink 3.1 -> 3
+    case 6: return 4;
+    case 7: return 5;
+    default: return raw_version;
+  }
+}
+
+// Get NVLink info (version, link count, aggregate throughput via CLI).
+// Designed for consumer GPUs (RTX 3090) where NVML utilization counters are unavailable.
 unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *nvlink_info) {
   if (!_gpu_info || !nvlink_info)
     return 0;
@@ -989,24 +1029,27 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
 
   memset(nvlink_info, 0, sizeof(*nvlink_info));
 
-  // Check if core NVLink functions are available
   if (!nvmlDeviceGetNvLinkState)
     return 0;
 
-  // Discover link count by probing each possible link (nvmlDeviceGetNvLinkLinkCount doesn't exist)
-  // We probe up to 18 links (NVML_NVLINK_MAX_LINKS_INTERNAL) and count valid ones
+  // Discover link count by probing each possible link
   unsigned int linkCount = 0;
+  unsigned int version = 0;
   for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS_INTERNAL; link++) {
     unsigned int isActive = 0;
     nvmlReturn_t ret = nvmlDeviceGetNvLinkState(device, link, &isActive);
-    // NVML_ERROR_NOT_SUPPORTED on the link index means it doesn't exist on this device
-    // NVML_SUCCESS means it exists (active or inactive)
     if (ret == NVML_SUCCESS || ret == NVML_ERROR_NOT_SUPPORTED) {
-      // We found a valid link or hit the end of available links
-      if (ret == NVML_SUCCESS)
+      if (ret == NVML_SUCCESS) {
         linkCount = link + 1;
+        // Read version on first link only (all links share the same version)
+        if (link == 0 && nvmlDeviceGetNvLinkVersion) {
+          nvmlReturn_t vret = nvmlDeviceGetNvLinkVersion(device, 0, &version);
+          if (vret == NVML_SUCCESS)
+            version = nvlink_marketing_version(version);
+        }
+      }
     } else {
-      break; // Error or invalid link
+      break;
     }
   }
 
@@ -1015,66 +1058,46 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
 
   nvlink_info->supported = true;
   nvlink_info->num_links = linkCount;
+  nvlink_info->version = version;
 
-  // Current time for rate calculation
-  struct timespec ts;
-  clock_gettime(CLOCK_MONOTONIC, &ts);
-  unsigned long long current_time_ms = (unsigned long long)ts.tv_sec * 1000ULL + (unsigned long long)ts.tv_nsec / 1000000ULL;
-  unsigned long long delta_ms = 0;
+  // Throughput via nvidia-smi CLI (NVML utilization counters unavailable on consumer GPUs)
+  // Poll every 2 seconds to keep CPU overhead low
+  nvtop_time current_time;
+  nvtop_get_current_time(&current_time);
+  if (gpu_info->last_nvlink_cli_time.tv_sec == 0 ||
+      nvtop_difftime(gpu_info->last_nvlink_cli_time, current_time) >= 2.) {
 
-  if (gpu_info->last_nvlink_throughput_time > 0) {
-    delta_ms = current_time_ms - gpu_info->last_nvlink_throughput_time;
-    if (delta_ms == 0)
-      delta_ms = 1; // Avoid division by zero
-  }
+    unsigned long long cli_tx[NVTOP_NVLINK_MAX_LINKS] = {0};
+    unsigned long long cli_rx[NVTOP_NVLINK_MAX_LINKS] = {0};
 
-  for (unsigned int link = 0; link < linkCount; link++) {
-    struct nvlink_link_info *linfo = &nvlink_info->links[link];
+    if (nvlink_cli_get_throughput(gpu_info->device_index, linkCount, cli_tx, cli_rx)) {
+      gpu_info->cli_poll_active = true;
 
-    // Link state
-    if (nvmlDeviceGetNvLinkState) {
-      unsigned int isActive = 0;
-      nvmlReturn_t ret = nvmlDeviceGetNvLinkState(device, link, &isActive);
-      if (ret == NVML_SUCCESS)
-        linfo->active = (isActive == NVML_NVLINK_STATE_ACTIVE);
-    }
+      if (gpu_info->last_nvlink_cli_time.tv_sec > 0) {
+        double delta_s = nvtop_difftime(gpu_info->last_nvlink_cli_time, current_time);
+        if (delta_s <= 0.) delta_s = 1e-9;
 
-    // Throughput using nvmlDeviceGetNvLinkUtilizationCounter (returns both RX and TX)
-    // This replaces nvmlDeviceGetNvLinkThroughput which doesn't exist
-    if (nvmlDeviceGetNvLinkUtilizationCounter) {
-      unsigned long long rx_counter = 0, tx_counter = 0;
-
-      // Try counter 0 first, fall back to counter 1
-      nvmlReturn_t ret = nvmlDeviceGetNvLinkUtilizationCounter(device, link, NVML_NVLINK_UTILIZATION_COUNTER_0, &rx_counter, &tx_counter);
-      if (ret != NVML_SUCCESS) {
-        ret = nvmlDeviceGetNvLinkUtilizationCounter(device, link, NVML_NVLINK_UTILIZATION_COUNTER_1, &rx_counter, &tx_counter);
-      }
-
-      if (ret == NVML_SUCCESS) {
-        // Counters are in bytes (KiB based on NVML docs)
-        // Calculate throughput rate in KiB/s
-        if (gpu_info->last_nvlink_throughput_time > 0) {
-          unsigned long long delta_tx = tx_counter - gpu_info->nvlink_tx_counters[link];
-          unsigned long long delta_rx = rx_counter - gpu_info->nvlink_rx_counters[link];
-          linfo->throughput_tx = (delta_tx * 1000ULL) / delta_ms;
-          linfo->throughput_rx = (delta_rx * 1000ULL) / delta_ms;
+        unsigned long long total_tx = 0, total_rx = 0;
+        for (unsigned int link = 0; link < linkCount; link++) {
+          total_tx += cli_tx[link] - gpu_info->nvlink_cli_tx[link];
+          total_rx += cli_rx[link] - gpu_info->nvlink_cli_rx[link];
         }
-        gpu_info->nvlink_tx_counters[link] = tx_counter;
-        gpu_info->nvlink_rx_counters[link] = rx_counter;
+        gpu_info->smoothed_agg_tx = (unsigned long long)((double)total_tx / delta_s);
+        gpu_info->smoothed_agg_rx = (unsigned long long)((double)total_rx / delta_s);
       }
-    }
 
-    // Error counters (cumulative)
-    if (nvmlDeviceGetNvLinkErrorCounter) {
-      nvmlReturn_t ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_REPLAY, &linfo->errors_replay);
-      ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_RECOVERY, &linfo->errors_recovery);
-      ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_CRC_FLIT, &linfo->errors_crc_flit);
-      ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_CRC_DATA, &linfo->errors_crc_data);
-      ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_ECC_DATA, &linfo->errors_ecc_data);
+      memcpy(gpu_info->nvlink_cli_tx, cli_tx, linkCount * sizeof(unsigned long long));
+      memcpy(gpu_info->nvlink_cli_rx, cli_rx, linkCount * sizeof(unsigned long long));
     }
+    gpu_info->last_nvlink_cli_time = current_time;
   }
 
-  gpu_info->last_nvlink_throughput_time = current_time_ms;
+  // Aggregate throughput: report the latest rate derived from CLI counter deltas (no EMA smoothing is applied here)
+  if (gpu_info->cli_poll_active) {
+    nvlink_info->has_throughput = true;
+    nvlink_info->aggregate_tx = gpu_info->smoothed_agg_tx;
+    nvlink_info->aggregate_rx = gpu_info->smoothed_agg_rx;
+  }
 
   return nvlink_info->num_links;
 }
diff --git a/src/interface.c b/src/interface.c
index 14c7bdbf..63ad99df 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -48,6 +48,32 @@ static unsigned int sizeof_device_field[device_field_count] = {
     [device_l2features] = 11, [device_execengines] = 11,
 };
 
+// True if any monitored device has NVLink — set before layout is computed
+static bool any_device_has_nvlink = false;
+
+// When NVLink is present, shrink fan field from 11 to 8 to make room on line 2
+static void nvtop_adjust_field_sizes_for_nvlink(void) {
+  if (any_device_has_nvlink) {
+    sizeof_device_field[device_fan_speed] = 8;  // "FAN %3u%%" (was 11 with padding)
+  }
+}
+
+bool nvtop_probe_nvlink_list(struct list_head *devices) {
+  struct gpu_info *gpu;
+  list_for_each_entry(gpu, devices, list) {
+    struct nvlink_info nvl;
+    memset(&nvl, 0, sizeof(nvl));
+    if (nvtop_get_nvlink_info(gpu, &nvl) > 0 && nvl.supported)
+      return true;
+  }
+  return false;
+}
+
+void nvtop_set_nvlink_probe(bool val) {
+  any_device_has_nvlink = val;
+  nvtop_adjust_field_sizes_for_nvlink();
+}
+
 static unsigned int sizeof_process_field[process_field_count] = {
     [process_pid] = 7,       [process_user] = 4,          [process_gpu_id] = 3,   [process_type] = 8,
     [process_gpu_rate] = 4,  [process_enc_rate] = 4,      [process_dec_rate] = 4,
@@ -69,35 +95,43 @@ static void alloc_device_window(unsigned int start_row, unsigned int start_col,
       newwin(1, sizeof_device_field[device_pcie], start_row, start_col + spacer + sizeof_device_field[device_name]);
   if (dwin->pcie_info == NULL)
     goto alloc_error;
-  dwin->nvlink_info =
-      newwin(1, sizeof_device_field[device_pcie], start_row + 1, start_col + spacer + sizeof_device_field[device_name]);
-  if (dwin->nvlink_info == NULL)
-    goto alloc_error;
 
-  // Line 2 = GPU clk | MEM clk | Temp | Fan | Power
-  dwin->gpu_clock_info = newwin(1, sizeof_device_field[device_clock], start_row + 2, start_col);
+  // Line 2 = GPU clk | MEM clk | Temp | Fan | Power | NVLink
+  dwin->gpu_clock_info = newwin(1, sizeof_device_field[device_clock], start_row + 1, start_col);
   if (dwin->gpu_clock_info == NULL)
     goto alloc_error;
-  dwin->mem_clock_info = newwin(1, sizeof_device_field[device_mem_clock], start_row + 2,
+  dwin->mem_clock_info = newwin(1, sizeof_device_field[device_mem_clock], start_row + 1,
                                 start_col + spacer + sizeof_device_field[device_clock]);
   if (dwin->mem_clock_info == NULL)
     goto alloc_error;
   dwin->temperature =
-      newwin(1, sizeof_device_field[device_temperature], start_row + 2,
+      newwin(1, sizeof_device_field[device_temperature], start_row + 1,
              start_col + spacer * 2 + sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock]);
   if (dwin->temperature == NULL)
     goto alloc_error;
-  dwin->fan_speed = newwin(1, sizeof_device_field[device_fan_speed], start_row + 2,
+  dwin->fan_speed = newwin(1, sizeof_device_field[device_fan_speed], start_row + 1,
                            start_col + spacer * 3 + sizeof_device_field[device_clock] +
                                sizeof_device_field[device_mem_clock] + sizeof_device_field[device_temperature]);
   if (dwin->fan_speed == NULL)
     goto alloc_error;
   dwin->power_info =
-      newwin(1, sizeof_device_field[device_power], start_row + 2,
+      newwin(1, sizeof_device_field[device_power], start_row + 1,
              start_col + spacer * 4 + sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] +
                  sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed]);
   if (dwin->power_info == NULL)
     goto alloc_error;
+  // NVLink appended to power_info on the same row (start_row + 1), using remaining width
+  if (any_device_has_nvlink) {
+    dwin->nvlink_info =
+        newwin(1, sizeof_device_field[device_pcie] - sizeof_device_field[device_power] - spacer * 3, start_row + 1,
+               start_col + spacer * 4 + sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] +
+                   sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed] +
+                   spacer * 2 + sizeof_device_field[device_power]);
+    if (dwin->nvlink_info == NULL)
+      goto alloc_error;
+  } else {
+    dwin->nvlink_info = NULL;
+  }
 
   // Line 3 = GPU used | MEM used | Encoder | Decoder
 
@@ -135,49 +169,49 @@ static void alloc_device_window(unsigned int start_row, unsigned int start_col,
     size_encode += 1;
   size_encode /= 2;
 
-  dwin->gpu_util_enc_dec = newwin(1, size_gpu, start_row + 3, start_col);
+  dwin->gpu_util_enc_dec = newwin(1, size_gpu, start_row + 2, start_col);
   if (dwin->gpu_util_enc_dec == NULL)
     goto alloc_error;
-  dwin->mem_util_enc_dec = newwin(1, size_mem, start_row + 3, start_col + spacer + size_gpu);
+  dwin->mem_util_enc_dec = newwin(1, size_mem, start_row + 2, start_col + spacer + size_gpu);
   if (dwin->mem_util_enc_dec == NULL)
     goto alloc_error;
-  dwin->encode_util = newwin(1, size_encode, start_row + 3, start_col + spacer * 2 + size_gpu + size_mem);
+  dwin->encode_util = newwin(1, size_encode, start_row + 2, start_col + spacer * 2 + size_gpu + size_mem);
   if (dwin->encode_util == NULL)
     goto alloc_error;
-  dwin->decode_util = newwin(1, size_decode, start_row + 3, start_col + spacer * 3 + size_gpu + size_mem + size_encode);
+  dwin->decode_util = newwin(1, size_decode, start_row + 2, start_col + spacer * 3 + size_gpu + size_mem + size_encode);
   if (dwin->decode_util == NULL)
     goto alloc_error;
-  dwin->encdec_util = newwin(1, size_encode * 2, start_row + 3, start_col + spacer * 2 + size_gpu + size_mem);
+  dwin->encdec_util = newwin(1, size_encode * 2, start_row + 2, start_col + spacer * 2 + size_gpu + size_mem);
   if (dwin->encdec_util == NULL)
     goto alloc_error;
   // For auto-hide encode / decode window
-  dwin->gpu_util_no_enc_or_dec = newwin(1, size_gpu + size_encode / 2 + 1, start_row + 3, start_col);
+  dwin->gpu_util_no_enc_or_dec = newwin(1, size_gpu + size_encode / 2 + 1, start_row + 2, start_col);
   if (dwin->gpu_util_no_enc_or_dec == NULL)
     goto alloc_error;
   dwin->mem_util_no_enc_or_dec =
-      newwin(1, size_mem + size_encode / 2, start_row + 3, start_col + spacer + size_gpu + size_encode / 2 + 1);
+      newwin(1, size_mem + size_encode / 2, start_row + 2, start_col + spacer + size_gpu + size_encode / 2 + 1);
   if (dwin->mem_util_no_enc_or_dec == NULL)
     goto alloc_error;
-  dwin->gpu_util_no_enc_and_dec = newwin(1, size_gpu + size_encode + 1, start_row + 3, start_col);
+  dwin->gpu_util_no_enc_and_dec = newwin(1, size_gpu + size_encode + 1, start_row + 2, start_col);
   if (dwin->gpu_util_no_enc_and_dec == NULL)
     goto alloc_error;
   dwin->mem_util_no_enc_and_dec =
-      newwin(1, size_mem + size_encode + 1, start_row + 3, start_col + spacer + size_gpu + size_encode + 1);
+      newwin(1, size_mem + size_encode + 1, start_row + 2, start_col + spacer + size_gpu + size_encode + 1);
   if (dwin->mem_util_no_enc_and_dec == NULL)
     goto alloc_error;
   dwin->enc_was_visible = false;
   dwin->dec_was_visible = false;
 
   // Line 4 = Number of shading cores | L2 Features
-  dwin->shader_cores = newwin(1, sizeof_device_field[device_shadercores], start_row + 4, start_col);
+  dwin->shader_cores = newwin(1, sizeof_device_field[device_shadercores], start_row + 3, start_col);
   if (dwin->shader_cores == NULL)
     goto alloc_error;
-  dwin->l2_cache_size = newwin(1, sizeof_device_field[device_l2features], start_row + 4,
+  dwin->l2_cache_size = newwin(1, sizeof_device_field[device_l2features], start_row + 3,
                                start_col + spacer + sizeof_device_field[device_shadercores]);
   if (dwin->l2_cache_size == NULL)
     goto alloc_error;
   dwin->exec_engines =
-      newwin(1, sizeof_device_field[device_execengines], start_row + 4,
+      newwin(1, sizeof_device_field[device_execengines], start_row + 3,
              start_col + spacer * 2 + sizeof_device_field[device_shadercores] + sizeof_device_field[device_l2features]);
   if (dwin->exec_engines == NULL)
     goto alloc_error;
@@ -206,7 +240,8 @@ static void free_device_windows(struct device_window *dwin) {
   delwin(dwin->temperature);
   delwin(dwin->fan_speed);
   delwin(dwin->pcie_info);
-  delwin(dwin->nvlink_info);
+  if (dwin->nvlink_info != NULL)
+    delwin(dwin->nvlink_info);
 }
 
 static void alloc_process_with_option(struct nvtop_interface *interface, unsigned posX, unsigned posY, unsigned sizeX,
@@ -349,10 +384,18 @@ static void alloc_plot_window(unsigned devices_count, struct window_position *pl
 }
 
 static unsigned device_length(void) {
+  // When no NVLink anywhere, match original repo layout exactly
+  if (!any_device_has_nvlink) {
+    return max(sizeof_device_field[device_name] + sizeof_device_field[device_pcie] + 1,
+               sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] +
+                   sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed] +
+                   sizeof_device_field[device_power] + 5);
+  }
+  // With NVLink: fan field shrinks 11->8 and +2 is added back to the row width (NOTE(review): this comment used to say +3 -- confirm which is intended)
   return max(sizeof_device_field[device_name] + sizeof_device_field[device_pcie] + 1,
              sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] +
                  sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed] +
-                 sizeof_device_field[device_power] + 5);
+                 sizeof_device_field[device_power] + 5 + 2);
 }
 
 static pid_t nvtop_pid;
@@ -369,7 +412,7 @@ static void initialize_all_windows(struct nvtop_interface *dwin) {
   struct window_position plot_positions[MAX_CHARTS];
   struct window_position setup_position;
 
-  compute_sizes_from_layout(devices_count, dwin->options.has_gpu_info_bar ? 5 : 4, device_length(), rows - 1, cols,
+  compute_sizes_from_layout(devices_count, dwin->options.has_gpu_info_bar ? 4 : 3, device_length(), rows - 1, cols,
                             dwin->options.gpu_specific_opts, dwin->options.process_fields_displayed, device_positions,
                             &dwin->num_plots, plot_positions, map_device_to_plot, &process_position, &setup_position,
                             dwin->options.hide_processes_list);
@@ -576,6 +619,8 @@ static void print_pcie_at_scale(WINDOW *win, unsigned int value) {
   wprintw(win, " %sB/s", memory_prefix[prefix_off]);
 }
 
+// Reuse existing print_pcie_at_scale for NVLink throughput (identical scale logic)
+
 static inline void werase_and_wnoutrefresh(WINDOW *w) {
   werase(w);
   wnoutrefresh(w);
@@ -779,19 +824,41 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
 
     // FAN
     if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, fan_speed)) {
-      mvwprintw(dev->fan_speed, 0, 0, " FAN %3u%%  ",
-                device->dynamic_info.fan_speed > 100 ? 100 : device->dynamic_info.fan_speed);
-      mvwchgat(dev->fan_speed, 0, 1, 3, 0, cyan_color, NULL);
+      if (any_device_has_nvlink) {
+        mvwprintw(dev->fan_speed, 0, 0, "FAN %3u%%",
+                  device->dynamic_info.fan_speed > 100 ? 100 : device->dynamic_info.fan_speed);
+        mvwchgat(dev->fan_speed, 0, 0, 3, 0, cyan_color, NULL);
+      } else {
+        mvwprintw(dev->fan_speed, 0, 0, " FAN %3u%%  ",
+                  device->dynamic_info.fan_speed > 100 ? 100 : device->dynamic_info.fan_speed);
+        mvwchgat(dev->fan_speed, 0, 1, 3, 0, cyan_color, NULL);
+      }
     } else if (device->static_info.integrated_graphics) {
-      mvwprintw(dev->fan_speed, 0, 0, "  CPU-FAN  ");
-      mvwchgat(dev->fan_speed, 0, 2, 7, 0, cyan_color, NULL);
+      if (any_device_has_nvlink) {
+        mvwprintw(dev->fan_speed, 0, 0, "CPU-FAN");
+        mvwchgat(dev->fan_speed, 0, 0, 7, 0, cyan_color, NULL);
+      } else {
+        mvwprintw(dev->fan_speed, 0, 0, "  CPU-FAN  ");
+        mvwchgat(dev->fan_speed, 0, 2, 7, 0, cyan_color, NULL);
+      }
     } else if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, fan_rpm)) {
-      mvwprintw(dev->fan_speed, 0, 0, "FAN %4uRPM",
-                device->dynamic_info.fan_rpm > 9999 ? 9999 : device->dynamic_info.fan_rpm);
-      mvwchgat(dev->fan_speed, 0, 0, 3, 0, cyan_color, NULL);
+      if (any_device_has_nvlink) {
+        mvwprintw(dev->fan_speed, 0, 0, "FAN%3uR",
+                  device->dynamic_info.fan_rpm > 999 ? 999 : device->dynamic_info.fan_rpm);
+        mvwchgat(dev->fan_speed, 0, 0, 3, 0, cyan_color, NULL);
+      } else {
+        mvwprintw(dev->fan_speed, 0, 0, "FAN %4uRPM",
+                  device->dynamic_info.fan_rpm > 9999 ? 9999 : device->dynamic_info.fan_rpm);
+        mvwchgat(dev->fan_speed, 0, 0, 3, 0, cyan_color, NULL);
+      }
     } else {
-      mvwprintw(dev->fan_speed, 0, 0, "  FAN N/A  ");
-      mvwchgat(dev->fan_speed, 0, 2, 3, 0, cyan_color, NULL);
+      if (any_device_has_nvlink) {
+        mvwprintw(dev->fan_speed, 0, 0, "FAN N/A");
+        mvwchgat(dev->fan_speed, 0, 0, 3, 0, cyan_color, NULL);
+      } else {
+        mvwprintw(dev->fan_speed, 0, 0, "  FAN N/A  ");
+        mvwchgat(dev->fan_speed, 0, 2, 3, 0, cyan_color, NULL);
+      }
     }
     wnoutrefresh(dev->fan_speed);
 
@@ -831,6 +898,33 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
     mvwchgat(dev->power_info, 0, 0, 3, 0, cyan_color, NULL);
     wnoutrefresh(dev->power_info);
 
+    // NVLink info (on same row as power_info)
+    if (dev->nvlink_info != NULL) {
+      werase(dev->nvlink_info);
+      struct nvlink_info nvl_info;
+      unsigned nvlinks = nvtop_get_nvlink_info(device, &nvl_info);
+      if (nvlinks > 0 && nvl_info.supported) {
+        wcolor_set(dev->nvlink_info, cyan_color, NULL);
+        wprintw(dev->nvlink_info, "NVL");
+        wcolor_set(dev->nvlink_info, magenta_color, NULL);
+        if (nvl_info.version > 0)
+          wprintw(dev->nvlink_info, "%u", nvl_info.version);
+        else
+          wprintw(dev->nvlink_info, "?");
+        wstandend(dev->nvlink_info);
+        if (nvl_info.num_links < 10)
+          wprintw(dev->nvlink_info, " %ux ", nvl_info.num_links);
+        else
+          wprintw(dev->nvlink_info, "%ux ", nvl_info.num_links);
+
+        if (nvl_info.has_throughput) {
+          unsigned total_kib = (unsigned)(nvl_info.aggregate_tx + nvl_info.aggregate_rx);
+          print_pcie_at_scale(dev->nvlink_info, total_kib);
+        }
+      }
+      wnoutrefresh(dev->nvlink_info);
+    }
+
     // PICe throughput
     werase(dev->pcie_info);
     if (device->static_info.integrated_graphics) {
@@ -866,84 +960,6 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
 
     wnoutrefresh(dev->pcie_info);
 
-    // NVLink info
-    werase(dev->nvlink_info);
-    wcolor_set(dev->nvlink_info, cyan_color, NULL);
-    mvwprintw(dev->nvlink_info, 0, 0, "NVLink ");
-    wstandend(dev->nvlink_info);
-
-    struct nvlink_info nvl_info;
-    unsigned nvlinks = nvtop_get_nvlink_info(device, &nvl_info);
-    if (nvlinks > 0 && nvl_info.supported) {
-      int pos = 7;
-      // Check if any link has throughput data (not available on consumer GPUs)
-      bool has_throughput = false;
-      for (unsigned link = 0; link < nvl_info.num_links; link++) {
-        if (nvl_info.links[link].throughput_tx > 0 || nvl_info.links[link].throughput_rx > 0) {
-          has_throughput = true;
-          break;
-        }
-      }
-
-      for (unsigned link = 0; link < nvl_info.num_links && pos < 44; link++) {
-        const struct nvlink_link_info *l = &nvl_info.links[link];
-        // Link indicator: A=active, x=inactive
-        wcolor_set(dev->nvlink_info, l->active ? green_color : red_color, NULL);
-        mvwprintw(dev->nvlink_info, 0, pos, "L%d%c", link, l->active ? 'A' : 'x');
-        wstandend(dev->nvlink_info);
-        pos += 4;
-        // Throughput (only shown when actually available)
-        if (has_throughput) {
-          if (pos + 10 > 44)
-            break;
-          // TX
-          if (l->throughput_tx > 0) {
-            if (l->throughput_tx > 1024) {
-              mvwprintw(dev->nvlink_info, 0, pos, "%3uM", (unsigned)(l->throughput_tx / 1024));
-            } else {
-              mvwprintw(dev->nvlink_info, 0, pos, "%3uk", (unsigned)l->throughput_tx);
-            }
-          } else {
-            mvwprintw(dev->nvlink_info, 0, pos, "  -");
-          }
-          pos += 4;
-          // Separator
-          if (pos < 44) {
-            mvwprintw(dev->nvlink_info, 0, pos, "/");
-            pos++;
-          }
-          // RX
-          if (pos + 5 > 44)
-            break;
-          if (l->throughput_rx > 0) {
-            if (l->throughput_rx > 1024) {
-              mvwprintw(dev->nvlink_info, 0, pos, "%3uM", (unsigned)(l->throughput_rx / 1024));
-            } else {
-              mvwprintw(dev->nvlink_info, 0, pos, "%3uk", (unsigned)l->throughput_rx);
-            }
-          } else {
-            mvwprintw(dev->nvlink_info, 0, pos, "  -");
-          }
-          pos += 4;
-        }
-        // Error indicator
-        if (pos < 44 && (l->errors_replay || l->errors_recovery || l->errors_crc_flit || l->errors_crc_data || l->errors_ecc_data)) {
-          wcolor_set(dev->nvlink_info, red_color, NULL);
-          mvwprintw(dev->nvlink_info, 0, pos, "!");
-          wstandend(dev->nvlink_info);
-          pos++;
-        }
-        if (pos < 44) {
-          mvwprintw(dev->nvlink_info, 0, pos, " ");
-          pos++;
-        }
-      }
-    } else {
-      wprintw(dev->nvlink_info, "N/A");
-    }
-
-    wnoutrefresh(dev->nvlink_info);
-
     if (interface->options.has_gpu_info_bar) {
       // Number of shader cores
       werase(dev->shader_cores);
diff --git a/src/nvtop.c b/src/nvtop.c
index 61302d29..12314cd8 100644
--- a/src/nvtop.c
+++ b/src/nvtop.c
@@ -312,6 +312,9 @@ int main(int argc, char **argv) {
   unsigned numMonitoredGpus =
       interface_check_and_fix_monitored_gpus(allDevCount, &monitoredGpus, &nonMonitoredGpus, &allDevicesOptions);
 
+  // Probe for NVLink before layout computation
+  nvtop_set_nvlink_probe(nvtop_probe_nvlink_list(&monitoredGpus));
+
   if (allDevicesOptions.show_startup_messages) {
     bool dont_show_again = show_information_messages(numWarningMessages, warningMessages);
     if (dont_show_again) {
@@ -335,6 +338,7 @@ int main(int argc, char **argv) {
       update_window_size_to_terminal_size(interface);
     }
     interface_check_monitored_gpu_change(&interface, allDevCount, &numMonitoredGpus, &monitoredGpus, &nonMonitoredGpus);
+    nvtop_set_nvlink_probe(nvtop_probe_nvlink_list(&monitoredGpus));
     if (time_slept >= interface_update_interval(interface)) {
       gpuinfo_refresh_dynamic_info(&monitoredGpus);
       if (!interface_freeze_processes(interface)) {

From 1b988061cd379a6cc15076040a5fda325b810cb7 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Fri, 1 May 2026 15:32:51 -0400
Subject: [PATCH 05/31] feat: add NVLink error/correction counters, fix display
 layout, rename scale function

Add NVLink error and CRC correction counter display on line 4 of the
device panel, showing cumulative errors (replay, recovery, CRC FLIT,
CRC DATA) and per-lane CRC flit corrections with conditional coloring
(errors in red, corrections in yellow). Counters use baseline
subtraction so they start at zero on nvtop launch and only increment
when new errors/corrections occur.

Display format: NVL E:00000 C:00000 (19 chars), with "NVL" in cyan
and numeric values conditionally colored. Window is allocated only
for devices with NVLink support.

- src/extract_gpuinfo_nvidia.c: Add nvlink_read_errors() function
  using nvmlDeviceGetNvLinkErrorCounter and nvmlDeviceGetFieldValues
  with baseline tracking per-device.

- include/nvtop/extract_gpuinfo_common.h: Add total_errors and
  total_corrections fields to struct nvlink_info.

- src/interface.c: Add nvlink_errors window allocation, deallocation,
  and display logic in draw_devices().

- include/nvtop/interface_internal_common.h: Add nvlink_errors window
  pointer and device_nvlink_errors enum entry.

- Rename print_pcie_at_scale() to print_data_at_scale() and raise the
  prefix loop's upper bound from 5 to 6 to support TiB/s (future-proofing for NVLink 5.0).

- Fix FAN field width (8 -> 11 chars) and reduce spacing to fit all
  fields within the device panel width.

- Fix device_length() to use max() across all three device lines
  instead of only lines 1 and 2.
---
 include/nvtop/extract_gpuinfo_common.h    |   2 +
 include/nvtop/interface_internal_common.h |   2 +
 src/extract_gpuinfo_nvidia.c              | 100 ++++++++++++++++++++++
 src/interface.c                           |  48 +++++++++--
 4 files changed, 146 insertions(+), 6 deletions(-)

diff --git a/include/nvtop/extract_gpuinfo_common.h b/include/nvtop/extract_gpuinfo_common.h
index 1324cbd9..577ad2a7 100644
--- a/include/nvtop/extract_gpuinfo_common.h
+++ b/include/nvtop/extract_gpuinfo_common.h
@@ -250,6 +250,8 @@ struct nvlink_info {
   bool has_throughput;                // Whether throughput data was available this cycle
   unsigned long long aggregate_tx;    // Aggregate TX throughput across all links (KiB/s)
   unsigned long long aggregate_rx;    // Aggregate RX throughput across all links (KiB/s)
+  unsigned long long total_errors;    // Cumulative-since-launch errors across all links
+  unsigned long long total_corrections; // Cumulative-since-launch CRC corrections across all links
 };
 
 unsigned nvtop_get_nvlink_info(struct gpu_info *gpu_info, struct nvlink_info *nvlink_info);
diff --git a/include/nvtop/interface_internal_common.h b/include/nvtop/interface_internal_common.h
index f5091998..e549a261 100644
--- a/include/nvtop/interface_internal_common.h
+++ b/include/nvtop/interface_internal_common.h
@@ -74,6 +74,7 @@ struct device_window {
   WINDOW *shader_cores;
   WINDOW *l2_cache_size;
   WINDOW *exec_engines;
+  WINDOW *nvlink_errors;
   bool enc_was_visible;
   bool dec_was_visible;
   nvtop_time last_decode_seen;
@@ -155,6 +156,7 @@ enum device_field {
   device_shadercores,
   device_l2features,
   device_execengines,
+  device_nvlink_errors,
   device_field_count,
 };
 
diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index 6fb305dc..3f7161fd 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -211,6 +211,8 @@ nvmlReturn_t (*nvmlDeviceGetMigMode)(nvmlDevice_t device, unsigned int *currentM
 // NVLink functions (not present in older NVML versions, gracefully handled)
 static nvmlReturn_t (*nvmlDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, unsigned int *isActive);
 static nvmlReturn_t (*nvmlDeviceGetNvLinkVersion)(nvmlDevice_t device, unsigned int link, unsigned int *version);
+static nvmlReturn_t (*nvmlDeviceGetNvLinkErrorCounter)(nvmlDevice_t device, unsigned int counter, unsigned int link, unsigned long long *value);
+static nvmlReturn_t (*nvmlDeviceGetFieldValues)(nvmlDevice_t device, int numFields, int *fieldIds, void *fieldValues);
 
 static void *libnvidia_ml_handle;
 
@@ -290,6 +292,11 @@ struct gpu_info_nvidia {
   nvtop_time last_nvlink_cli_time; // Timestamp of last CLI poll (uses app's existing time API)
   unsigned long long smoothed_agg_tx; // EMA-smoothed aggregate TX for display
   unsigned long long smoothed_agg_rx; // EMA-smoothed aggregate RX for display
+
+  // NVLink error counter baselines (cumulative since boot, tracked per-device)
+  unsigned long long baseline_errors; // Cumulative errors at last read
+  unsigned long long baseline_corrections; // Cumulative corrections at last read
+  bool nvlink_errors_baseline_read; // True after first read establishes baseline
 };
 
 static LIST_HEAD(allocations);
@@ -487,6 +494,8 @@ static bool gpuinfo_nvidia_init(void) {
   // NVLink functions (optional - not available on all drivers/hardware)
   nvmlDeviceGetNvLinkState = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkState");
   nvmlDeviceGetNvLinkVersion = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkVersion");
+  nvmlDeviceGetNvLinkErrorCounter = dlsym(libnvidia_ml_handle, "nvmlDeviceGetNvLinkErrorCounter");
+  nvmlDeviceGetFieldValues = dlsym(libnvidia_ml_handle, "nvmlDeviceGetFieldValues");
 
   last_nvml_return_status = nvmlInit();
   if (last_nvml_return_status != NVML_SUCCESS) {
@@ -959,12 +968,100 @@ static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info) {
 // NVML NVLink enums (defined locally since we don't have nvml.h)
 #define NVML_NVLINK_MAX_LINKS_INTERNAL 18
 
+// NVML error counter types
+#define NVML_NVLINK_ERROR_DL_REPLAY   0
+#define NVML_NVLINK_ERROR_DL_RECOVERY 1
+#define NVML_NVLINK_ERROR_DL_CRC_FLIT 2
+#define NVML_NVLINK_ERROR_DL_CRC_DATA 3
+#define NVML_NVLINK_ERROR_DL_ECC_DATA 4
+
+// nvmlFieldValue_t struct layout (no header available — offsets documented)
+// Total size: 8 bytes (fieldId:4, valueType:4, value.union:4) = 12 bytes per entry
+#define NVM_LVALUE_FIELD_ID_OFF     0
+#define NVM_LVALUE_VALUE_TYPE_OFF   4
+#define NVM_LVALUE_UINT64_OFF       8
+#define NVM_LVALUE_SIZE             12
+
 #include <stdio.h>
 #include <string.h>
 
 // Forward declaration
 struct gpu_info_nvidia;
 
+// Read NVLink error counters and CRC corrections into nvlink_info->total_errors and total_corrections.
+// Uses baseline subtraction to show only errors/corrections since nvtop launch (Option B).
+// Phase 1: nvmlDeviceGetNvLinkErrorCounter for replay, recovery, CRC errors per link.
+// Phase 2: nvmlDeviceGetFieldValues for per-lane CRC flit corrections (field IDs 32-49 for links 0-17).
+static void nvlink_read_errors(nvmlDevice_t device, unsigned int linkCount, struct gpu_info_nvidia *gpu_info, struct nvlink_info *nvlink_info) {
+  if (!nvlink_info)
+    return;
+
+  // Phase 1: error counters via nvmlDeviceGetNvLinkErrorCounter
+  unsigned long long cumulative_errors = 0;
+  if (nvmlDeviceGetNvLinkErrorCounter) {
+    for (unsigned int link = 0; link < linkCount; link++) {
+      unsigned long long val = 0;
+      nvmlReturn_t ret;
+      // Replay errors (NVML's argument order is: device, link index, counter type, out value)
+      ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_REPLAY, &val);
+      if (ret == NVML_SUCCESS) cumulative_errors += val;
+      // Recovery errors
+      ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_RECOVERY, &val);
+      if (ret == NVML_SUCCESS) cumulative_errors += val;
+      // CRC FLIT errors
+      ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_CRC_FLIT, &val);
+      if (ret == NVML_SUCCESS) cumulative_errors += val;
+      // CRC DATA errors
+      ret = nvmlDeviceGetNvLinkErrorCounter(device, link, NVML_NVLINK_ERROR_DL_CRC_DATA, &val);
+      if (ret == NVML_SUCCESS) cumulative_errors += val;
+    }
+  }
+
+  // Phase 2: per-lane CRC corrections via nvmlDeviceGetFieldValues
+  // Field IDs: link 0 = 32-37, link 1 = 38-43, link 2 = 44-49, etc. (6 field IDs per link for lanes 0-5)
+  unsigned long long cumulative_corrections = 0;
+  if (nvmlDeviceGetFieldValues) {
+    for (unsigned int link = 0; link < linkCount; link++) {
+      // Query 6 field IDs per link (lanes 0-5)
+      int base_field_id = 32 + link * 6;
+      int field_ids[6];
+      // Raw bytes for nvmlFieldValue_t structs (12 bytes each)
+      char raw[6 * NVM_LVALUE_SIZE];
+
+      for (int i = 0; i < 6; i++)
+        field_ids[i] = base_field_id + i;
+
+      nvmlReturn_t ret = nvmlDeviceGetFieldValues(device, 6, field_ids, raw);
+      if (ret == NVML_SUCCESS) {
+        for (int i = 0; i < 6; i++) {
+          // Read the uint64 value from the raw bytes
+          unsigned long long val = 0;
+          memcpy(&val, raw + i * NVM_LVALUE_SIZE + NVM_LVALUE_UINT64_OFF, sizeof(val));
+          cumulative_corrections += val;
+        }
+      }
+    }
+  }
+
+  // Baseline subtraction: show only errors/corrections since nvtop launch
+  if (!gpu_info->nvlink_errors_baseline_read) {
+    // First read — establish baseline, display zeros
+    gpu_info->baseline_errors = cumulative_errors;
+    gpu_info->baseline_corrections = cumulative_corrections;
+    gpu_info->nvlink_errors_baseline_read = true;
+    nvlink_info->total_errors = 0;
+    nvlink_info->total_corrections = 0;
+  } else {
+    // Subsequent reads — show delta from baseline
+    nvlink_info->total_errors = cumulative_errors > gpu_info->baseline_errors
+                                   ? cumulative_errors - gpu_info->baseline_errors
+                                   : 0;
+    nvlink_info->total_corrections = cumulative_corrections > gpu_info->baseline_corrections
+                                        ? cumulative_corrections - gpu_info->baseline_corrections
+                                        : 0;
+  }
+}
+
 // Parse nvidia-smi nvlink --getthroughput d output
 // Returns number of links parsed (0 on failure)
 static unsigned nvlink_cli_get_throughput(int device_index, unsigned int link_count,
@@ -1099,5 +1196,8 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
     nvlink_info->aggregate_rx = gpu_info->smoothed_agg_rx;
   }
 
+  // Error counters and CRC corrections
+  nvlink_read_errors(device, linkCount, gpu_info, nvlink_info);
+
   return nvlink_info->num_links;
 }
diff --git a/src/interface.c b/src/interface.c
index 63ad99df..27929f5d 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -46,6 +46,7 @@ static unsigned int sizeof_device_field[device_field_count] = {
     [device_name] = 11,       [device_fan_speed] = 11,   [device_temperature] = 10, [device_power] = 15,
     [device_clock] = 11,      [device_mem_clock] = 12,   [device_pcie] = 46,        [device_shadercores] = 7,
     [device_l2features] = 11, [device_execengines] = 11,
+    [device_nvlink_errors] = 19,
 };
 
 // True if any monitored device has NVLink — set before layout is computed
@@ -215,6 +216,17 @@ static void alloc_device_window(unsigned int start_row, unsigned int start_col,
              start_col + spacer * 2 + sizeof_device_field[device_shadercores] + sizeof_device_field[device_l2features]);
   if (dwin->exec_engines == NULL)
     goto alloc_error;
+  // NVLink errors appended to exec_engines on the same row (start_row + 3), conditional on NVLink
+  if (any_device_has_nvlink) {
+    dwin->nvlink_errors =
+        newwin(1, sizeof_device_field[device_nvlink_errors], start_row + 3,
+               start_col + spacer * 3 + sizeof_device_field[device_shadercores] +
+                   sizeof_device_field[device_l2features] + sizeof_device_field[device_execengines]);
+    if (dwin->nvlink_errors == NULL)
+      goto alloc_error;
+  } else {
+    dwin->nvlink_errors = NULL;
+  }
 
   return;
 alloc_error:
@@ -242,6 +254,8 @@ static void free_device_windows(struct device_window *dwin) {
   delwin(dwin->pcie_info);
   if (dwin->nvlink_info != NULL)
     delwin(dwin->nvlink_info);
+  if (dwin->nvlink_errors != NULL)
+    delwin(dwin->nvlink_errors);
 }
 
 static void alloc_process_with_option(struct nvtop_interface *interface, unsigned posX, unsigned posY, unsigned sizeX,
@@ -601,10 +615,10 @@ static void draw_temp_color(WINDOW *win, unsigned int temp, unsigned int temp_sl
   wnoutrefresh(win);
 }
 
-static void print_pcie_at_scale(WINDOW *win, unsigned int value) {
+static void print_data_at_scale(WINDOW *win, unsigned int value) {
   int prefix_off;
   double val_d = value;
-  for (prefix_off = 1; prefix_off < 5 && val_d >= 1000.; ++prefix_off) {
+  for (prefix_off = 1; prefix_off < 6 && val_d >= 1000.; ++prefix_off) {
     val_d = val_d / 1024.;
   }
   if (val_d >= 100.) {
@@ -619,7 +633,7 @@ static void print_pcie_at_scale(WINDOW *win, unsigned int value) {
   wprintw(win, " %sB/s", memory_prefix[prefix_off]);
 }
 
-// Reuse existing print_pcie_at_scale for NVLink throughput (identical scale logic)
+// Renamed from print_pcie_at_scale -> print_data_at_scale: reused for NVLink throughput (identical scale logic, bounds check extended to support TiB/s)
 
 static inline void werase_and_wnoutrefresh(WINDOW *w) {
   werase(w);
@@ -919,7 +933,7 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
 
         if (nvl_info.has_throughput) {
           unsigned total_kib = (unsigned)(nvl_info.aggregate_tx + nvl_info.aggregate_rx);
-          print_pcie_at_scale(dev->nvlink_info, total_kib);
+          print_data_at_scale(dev->nvlink_info, total_kib);
         }
       }
       wnoutrefresh(dev->nvlink_info);
@@ -947,14 +961,14 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
     wprintw(dev->pcie_info, " RX: ");
     wstandend(dev->pcie_info);
     if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, pcie_rx))
-      print_pcie_at_scale(dev->pcie_info, device->dynamic_info.pcie_rx);
+      print_data_at_scale(dev->pcie_info, device->dynamic_info.pcie_rx);
     else
       wprintw(dev->pcie_info, "N/A");
     wcolor_set(dev->pcie_info, magenta_color, NULL);
     wprintw(dev->pcie_info, " TX: ");
     wstandend(dev->pcie_info);
     if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, pcie_tx))
-      print_pcie_at_scale(dev->pcie_info, device->dynamic_info.pcie_tx);
+      print_data_at_scale(dev->pcie_info, device->dynamic_info.pcie_tx);
     else
       wprintw(dev->pcie_info, "N/A");
 
@@ -996,6 +1010,28 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
         wprintw(dev->exec_engines, "N/A");
 
       wnoutrefresh(dev->exec_engines);
+
+      // NVLink errors/corrections (conditional on NVLink)
+      if (dev->nvlink_errors != NULL) {
+        werase(dev->nvlink_errors);
+        struct nvlink_info nvl_info;
+        unsigned nvlinks = nvtop_get_nvlink_info(device, &nvl_info);
+        if (nvlinks > 0 && nvl_info.supported) {
+          wcolor_set(dev->nvlink_errors, cyan_color, NULL);
+          wprintw(dev->nvlink_errors, "NVL");
+          wstandend(dev->nvlink_errors);
+          wprintw(dev->nvlink_errors, " E:");
+          if (nvl_info.total_errors > 0)
+            wcolor_set(dev->nvlink_errors, red_color, NULL);
+          wprintw(dev->nvlink_errors, "%05u", (unsigned)(nvl_info.total_errors % 100000));
+          wstandend(dev->nvlink_errors);
+          wprintw(dev->nvlink_errors, " C:");
+          if (nvl_info.total_corrections > 0)
+            wcolor_set(dev->nvlink_errors, yellow_color, NULL);
+          wprintw(dev->nvlink_errors, "%05u", (unsigned)(nvl_info.total_corrections % 100000));
+        }
+        wnoutrefresh(dev->nvlink_errors);
+      }
     }
 
     dev_id++;

From 3f0face398fe9be50289886b6cddbb1023ca616a Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Fri, 1 May 2026 20:00:49 -0400
Subject: [PATCH 06/31] fix: baseline reset for NVLink error counters, fix
 nvmlFieldValue_t layout, update display getter

Move nvlink_read_errors() out of nvtop_get_nvlink_info() and into
gpuinfo_nvidia_refresh_dynamic_info() so the baseline is not established
during the startup probe. This ensures E:00000 C:00000 on every nvtop
launch.

- Add display_errors/display_corrections fields to struct gpu_info_nvidia
- Add nvtop_get_nvlink_error_counts() public getter (extract_gpuinfo_common.h)
- Update interface.c to use the getter instead of nvtop_get_nvlink_info()
- Fix nvmlFieldValue_t struct offsets: 48 bytes (not 12), ullVal at offset 32
- Fix function-pointer signature for nvmlDeviceGetFieldValues (remove fieldIds parameter)
- Populate fieldId in-place in the raw buffer before calling GetFieldValues
---
 include/nvtop/extract_gpuinfo_common.h |   6 ++
 src/extract_gpuinfo_nvidia.c           | 100 ++++++++++++++++++-------
 src/interface.c                        |  13 ++--
 3 files changed, 85 insertions(+), 34 deletions(-)

diff --git a/include/nvtop/extract_gpuinfo_common.h b/include/nvtop/extract_gpuinfo_common.h
index 577ad2a7..f1019efb 100644
--- a/include/nvtop/extract_gpuinfo_common.h
+++ b/include/nvtop/extract_gpuinfo_common.h
@@ -256,6 +256,12 @@ struct nvlink_info {
 
 unsigned nvtop_get_nvlink_info(struct gpu_info *gpu_info, struct nvlink_info *nvlink_info);
 
+// Get display-ready NVLink error/correction counts from the per-device persistent struct.
+// Returns true if baseline has been established at least once.
+bool nvtop_get_nvlink_error_counts(struct gpu_info *gpu_info,
+                                    unsigned long long *out_errors,
+                                    unsigned long long *out_corrections);
+
 // NVLink probe — call before initialize_curses to set layout mode
 bool nvtop_probe_nvlink_list(struct list_head *devices);
 void nvtop_set_nvlink_probe(bool val);
diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index 3f7161fd..c47bde9b 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -212,7 +212,7 @@ nvmlReturn_t (*nvmlDeviceGetMigMode)(nvmlDevice_t device, unsigned int *currentM
 static nvmlReturn_t (*nvmlDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, unsigned int *isActive);
 static nvmlReturn_t (*nvmlDeviceGetNvLinkVersion)(nvmlDevice_t device, unsigned int link, unsigned int *version);
 static nvmlReturn_t (*nvmlDeviceGetNvLinkErrorCounter)(nvmlDevice_t device, unsigned int counter, unsigned int link, unsigned long long *value);
-static nvmlReturn_t (*nvmlDeviceGetFieldValues)(nvmlDevice_t device, int numFields, int *fieldIds, void *fieldValues);
+static nvmlReturn_t (*nvmlDeviceGetFieldValues)(nvmlDevice_t device, unsigned int valuesCount, void *fieldValues);
 
 static void *libnvidia_ml_handle;
 
@@ -297,6 +297,10 @@ struct gpu_info_nvidia {
   unsigned long long baseline_errors; // Cumulative errors at last read
   unsigned long long baseline_corrections; // Cumulative corrections at last read
   bool nvlink_errors_baseline_read; // True after first read establishes baseline
+
+  // Display-ready error/correction counts (computed in refresh_dynamic_info)
+  unsigned long long display_errors; // Errors since nvtop launch
+  unsigned long long display_corrections; // Corrections since nvtop launch
 };
 
 static LIST_HEAD(allocations);
@@ -309,6 +313,9 @@ static void gpuinfo_nvidia_populate_static_info(struct gpu_info *_gpu_info);
 static void gpuinfo_nvidia_refresh_dynamic_info(struct gpu_info *_gpu_info);
 static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info);
 
+// Forward declaration for nvlink_read_errors (defined later, called from refresh_dynamic_info)
+static void nvlink_read_errors(struct nvmlDevice *device, unsigned int linkCount, struct gpu_info_nvidia *gpu_info);
+
 struct gpu_vendor gpu_vendor_nvidia = {
     .init = gpuinfo_nvidia_init,
     .shutdown = gpuinfo_nvidia_shutdown,
@@ -777,6 +784,24 @@ static void gpuinfo_nvidia_refresh_dynamic_info(struct gpu_info *_gpu_info) {
       SET_GPUINFO_DYNAMIC(dynamic_info, multi_instance_mode, currentMode == NVML_DEVICE_MIG_ENABLE);
     }
   }
+
+  // NVLink error counters (called here, not in nvtop_get_nvlink_info, to avoid the startup probe
+  // establishing the baseline early and causing non-zero counters on first display refresh)
+  if (nvmlDeviceGetNvLinkErrorCounter || nvmlDeviceGetFieldValues) {
+    unsigned int linkCount = 0;
+    if (nvmlDeviceGetNvLinkState) {
+      for (unsigned int link = 0; link < 18; link++) {
+        unsigned int isActive = 0;
+        nvmlReturn_t ret = nvmlDeviceGetNvLinkState(device, link, &isActive);
+        if (ret == NVML_SUCCESS)
+          linkCount = link + 1;
+        else if (ret != NVML_ERROR_NOT_SUPPORTED)
+          break;
+      }
+    }
+    if (linkCount > 0)
+      nvlink_read_errors(device, linkCount, gpu_info);
+  }
 }
 
 static void gpuinfo_nvidia_get_process_utilization(struct gpu_info_nvidia *gpu_info, unsigned num_processes_recovered,
@@ -975,12 +1000,16 @@ static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info) {
 #define NVML_NVLINK_ERROR_DL_CRC_DATA 3
 #define NVML_NVLINK_ERROR_DL_ECC_DATA 4
 
-// nvmlFieldValue_t struct layout (no header available — offsets documented)
-// Total size: 8 bytes (fieldId:4, valueType:4, value.union:4) = 12 bytes per entry
+// nvmlFieldValue_t struct layout (from nvml.h — offsets may vary by driver version)
+// Total size: 48 bytes (NVML 11.515+, verified on driver 580.142)
+// Layout: fieldId:u32(0), scopeId:u32(4), timestamp:u64(8), latencyUsec:u64(16),
+//         valueType:u32(24), nvmlReturn:u32(28), value.union(32) [ullVal at offset 32],
+// NOTE: nvmlFieldValue_t layout varies across NVML/driver versions.
+// Always verify against the header shipped with the driver you're targeting.
 #define NVM_LVALUE_FIELD_ID_OFF     0
 #define NVM_LVALUE_VALUE_TYPE_OFF   4
-#define NVM_LVALUE_UINT64_OFF       8
-#define NVM_LVALUE_SIZE             12
+#define NVM_LVALUE_UINT64_OFF       32
+#define NVM_LVALUE_SIZE             48
 
 #include <stdio.h>
 #include <string.h>
@@ -988,14 +1017,12 @@ static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info) {
 // Forward declaration
 struct gpu_info_nvidia;
 
-// Read NVLink error counters and CRC corrections into nvlink_info->total_errors and total_corrections.
+// Read NVLink error counters and CRC corrections, storing results in the persistent gpu_info struct.
 // Uses baseline subtraction to show only errors/corrections since nvtop launch (Option B).
+// Called from refresh_dynamic_info so it does NOT run during the startup probe in nvtop_probe_nvlink_list.
 // Phase 1: nvmlDeviceGetNvLinkErrorCounter for replay, recovery, CRC errors per link.
 // Phase 2: nvmlDeviceGetFieldValues for per-lane CRC flit corrections (field IDs 32-49 for links 0-17).
-static void nvlink_read_errors(nvmlDevice_t device, unsigned int linkCount, struct gpu_info_nvidia *gpu_info, struct nvlink_info *nvlink_info) {
-  if (!nvlink_info)
-    return;
-
+static void nvlink_read_errors(nvmlDevice_t device, unsigned int linkCount, struct gpu_info_nvidia *gpu_info) {
   // Phase 1: error counters via nvmlDeviceGetNvLinkErrorCounter
   unsigned long long cumulative_errors = 0;
   if (nvmlDeviceGetNvLinkErrorCounter) {
@@ -1019,22 +1046,26 @@ static void nvlink_read_errors(nvmlDevice_t device, unsigned int linkCount, stru
 
   // Phase 2: per-lane CRC corrections via nvmlDeviceGetFieldValues
   // Field IDs: link 0 = 32-37, link 1 = 38-43, link 2 = 44-49, etc. (6 field IDs per link for lanes 0-5)
+  // The caller must populate fieldId in each nvmlFieldValue_t entry BEFORE calling;
+  // the library populates the value fields on return.
+  // nvmlFieldValue_t is 48 bytes: fieldId:u32(0), scopeId:u32(4), timestamp:u64(8),
+  // latencyUsec:u64(16), valueType:u32(24), nvmlReturn:u32(28), value.union(32).
   unsigned long long cumulative_corrections = 0;
   if (nvmlDeviceGetFieldValues) {
     for (unsigned int link = 0; link < linkCount; link++) {
-      // Query 6 field IDs per link (lanes 0-5)
       int base_field_id = 32 + link * 6;
-      int field_ids[6];
-      // Raw bytes for nvmlFieldValue_t structs (12 bytes each)
       char raw[6 * NVM_LVALUE_SIZE];
 
-      for (int i = 0; i < 6; i++)
-        field_ids[i] = base_field_id + i;
+      // Zero out the buffer, then populate fieldId in each entry (offset 0, uint32_t)
+      memset(raw, 0, sizeof(raw));
+      for (int i = 0; i < 6; i++) {
+        unsigned int fid = (unsigned int)(base_field_id + i);
+        memcpy(raw + i * NVM_LVALUE_SIZE + NVM_LVALUE_FIELD_ID_OFF, &fid, sizeof(fid));
+      }
 
-      nvmlReturn_t ret = nvmlDeviceGetFieldValues(device, 6, field_ids, raw);
+      nvmlReturn_t ret = nvmlDeviceGetFieldValues(device, 6, raw);
       if (ret == NVML_SUCCESS) {
         for (int i = 0; i < 6; i++) {
-          // Read the uint64 value from the raw bytes
           unsigned long long val = 0;
           memcpy(&val, raw + i * NVM_LVALUE_SIZE + NVM_LVALUE_UINT64_OFF, sizeof(val));
           cumulative_corrections += val;
@@ -1049,19 +1080,33 @@ static void nvlink_read_errors(nvmlDevice_t device, unsigned int linkCount, stru
     gpu_info->baseline_errors = cumulative_errors;
     gpu_info->baseline_corrections = cumulative_corrections;
     gpu_info->nvlink_errors_baseline_read = true;
-    nvlink_info->total_errors = 0;
-    nvlink_info->total_corrections = 0;
+    gpu_info->display_errors = 0;
+    gpu_info->display_corrections = 0;
   } else {
     // Subsequent reads — show delta from baseline
-    nvlink_info->total_errors = cumulative_errors > gpu_info->baseline_errors
-                                   ? cumulative_errors - gpu_info->baseline_errors
-                                   : 0;
-    nvlink_info->total_corrections = cumulative_corrections > gpu_info->baseline_corrections
-                                        ? cumulative_corrections - gpu_info->baseline_corrections
-                                        : 0;
+    gpu_info->display_errors = cumulative_errors > gpu_info->baseline_errors
+                                  ? cumulative_errors - gpu_info->baseline_errors
+                                  : 0;
+    gpu_info->display_corrections = cumulative_corrections > gpu_info->baseline_corrections
+                                       ? cumulative_corrections - gpu_info->baseline_corrections
+                                       : 0;
   }
 }
 
+// Public getter for display-ready error/correction counts from a struct gpu_info.
+// Returns true if data is available (errors or corrections read at least once).
+bool nvtop_get_nvlink_error_counts(struct gpu_info *_gpu_info,
+                                    unsigned long long *out_errors,
+                                    unsigned long long *out_corrections) {
+  struct gpu_info_nvidia *gpu_info = container_of(_gpu_info, struct gpu_info_nvidia, base);
+  if (!gpu_info->nvlink_errors_baseline_read) {
+    return false;
+  }
+  *out_errors = gpu_info->display_errors;
+  *out_corrections = gpu_info->display_corrections;
+  return true;
+}
+
 // Parse nvidia-smi nvlink --getthroughput d output
 // Returns number of links parsed (0 on failure)
 static unsigned nvlink_cli_get_throughput(int device_index, unsigned int link_count,
@@ -1196,8 +1241,9 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
     nvlink_info->aggregate_rx = gpu_info->smoothed_agg_rx;
   }
 
-  // Error counters and CRC corrections
-  nvlink_read_errors(device, linkCount, gpu_info, nvlink_info);
+  // Error counters are read separately by nvlink_read_errors(), which is called
+  // from refresh_dynamic_info. Do NOT call it here: the startup probe
+  // (nvtop_probe_nvlink_list) would otherwise establish the baseline early.
 
   return nvlink_info->num_links;
 }
diff --git a/src/interface.c b/src/interface.c
index 27929f5d..be175e2c 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -1014,21 +1014,20 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
       // NVLink errors/corrections (conditional on NVLink)
       if (dev->nvlink_errors != NULL) {
         werase(dev->nvlink_errors);
-        struct nvlink_info nvl_info;
-        unsigned nvlinks = nvtop_get_nvlink_info(device, &nvl_info);
-        if (nvlinks > 0 && nvl_info.supported) {
+        unsigned long long err_cnt = 0, cor_cnt = 0;
+        if (nvtop_get_nvlink_error_counts(device, &err_cnt, &cor_cnt)) {
           wcolor_set(dev->nvlink_errors, cyan_color, NULL);
           wprintw(dev->nvlink_errors, "NVL");
           wstandend(dev->nvlink_errors);
           wprintw(dev->nvlink_errors, " E:");
-          if (nvl_info.total_errors > 0)
+          if (err_cnt > 0)
             wcolor_set(dev->nvlink_errors, red_color, NULL);
-          wprintw(dev->nvlink_errors, "%05u", (unsigned)(nvl_info.total_errors % 100000));
+          wprintw(dev->nvlink_errors, "%05u", (unsigned)(err_cnt % 100000));
           wstandend(dev->nvlink_errors);
           wprintw(dev->nvlink_errors, " C:");
-          if (nvl_info.total_corrections > 0)
+          if (cor_cnt > 0)
             wcolor_set(dev->nvlink_errors, yellow_color, NULL);
-          wprintw(dev->nvlink_errors, "%05u", (unsigned)(nvl_info.total_corrections % 100000));
+          wprintw(dev->nvlink_errors, "%05u", (unsigned)(cor_cnt % 100000));
         }
         wnoutrefresh(dev->nvlink_errors);
       }

From 02464d689ef045c63057d285bb0978d19598c521 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Fri, 1 May 2026 22:19:49 -0400
Subject: [PATCH 07/31] feat: switch NVLink throughput from data-only to raw
 counters (--getthroughput r)

Use raw (payload + protocol overhead) counters instead of data-only for
the nvidia-smi CLI fallback path. This ensures fully saturated links
show the rated link speed (e.g. ~14.062 GB/s per link on NVLink 3.0)
rather than roughly half that from data-only counters.

- Changed --getthroughput d to --getthroughput r
- Updated parsing from 'Data Tx/Rx' to 'Raw Tx/Rx'
- Added explanatory code comments
---
 src/extract_gpuinfo_nvidia.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index c47bde9b..cfa78c1b 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -1107,12 +1107,14 @@ bool nvtop_get_nvlink_error_counts(struct gpu_info *_gpu_info,
   return true;
 }
 
-// Parse nvidia-smi nvlink --getthroughput d output
+// Parse nvidia-smi nvlink --getthroughput r output
+// "r" (raw) includes payload + protocol overhead — needed to show true bandwidth utilization
+// (consumer GPUs do not expose NVML nvmlDeviceGetNvLinkUtilizationCounter)
 // Returns number of links parsed (0 on failure)
 static unsigned nvlink_cli_get_throughput(int device_index, unsigned int link_count,
                                           unsigned long long *tx_out, unsigned long long *rx_out) {
   char cmd[256];
-  snprintf(cmd, sizeof(cmd), "nvidia-smi nvlink --getthroughput d -i %d 2>/dev/null", device_index);
+  snprintf(cmd, sizeof(cmd), "nvidia-smi nvlink --getthroughput r -i %d 2>/dev/null", device_index);
 
   FILE *fp = popen(cmd, "r");
   if (!fp)
@@ -1129,10 +1131,10 @@ static unsigned nvlink_cli_get_throughput(int device_index, unsigned int link_co
     char *p = line;
     while (*p == '\t' || *p == ' ')
       p++;
-    if (sscanf(p, "Link %u: Data Tx: %llu", &link, &val) == 2 && (unsigned)link < link_count) {
+    if (sscanf(p, "Link %u: Raw Tx: %llu", &link, &val) == 2 && (unsigned)link < link_count) {
       tx_out[link] = val;
       parsed++;
-    } else if (sscanf(p, "Link %u: Data Rx: %llu", &link, &val) == 2 && (unsigned)link < link_count) {
+    } else if (sscanf(p, "Link %u: Raw Rx: %llu", &link, &val) == 2 && (unsigned)link < link_count) {
       rx_out[link] = val;
       parsed++;
     }

From 336ee079fc5a21e277e3c4ba739e7b500f211999 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Fri, 1 May 2026 22:31:19 -0400
Subject: [PATCH 08/31] docs: add TODO for datacenter GPU NVML API path, remove
 stale EMA comment

Add a code comment guiding future developers with datacenter NVLink
hardware (A100, H100) to replace the CLI fallback with the NVML
nvmlDeviceGetNvLinkUtilizationCounter API, while keeping the CLI as a
consumer GPU fallback. Also remove the misleading 'EMA smoothing' comment
on the aggregate throughput output since no smoothing is actually applied.
---
 src/extract_gpuinfo_nvidia.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index cfa78c1b..e8b4c3a1 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -1204,8 +1204,14 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
   nvlink_info->num_links = linkCount;
   nvlink_info->version = version;
 
-  // Throughput via nvidia-smi CLI (NVML utilization counters unavailable on consumer GPUs)
-  // Poll every 2 seconds to keep CPU overhead low
+  // Throughput via nvidia-smi CLI (NVML utilization counters unavailable on consumer GPUs).
+  // Poll every 2 seconds to keep CPU overhead low.
+  //
+  // TODO: On datacenter GPUs (A100, H100, etc.) that expose NVML
+  // nvmlDeviceGetNvLinkUtilizationCounter, replace this CLI path with the
+  // direct API call (zero process overhead). Keep this nvidia-smi CLI code
+  // as a conditional fallback for consumer GPUs (RTX 3090, 3080 Ti) where
+  // the NVML utilization counter is not exposed.
   nvtop_time current_time;
   nvtop_get_current_time(&current_time);
   if (gpu_info->last_nvlink_cli_time.tv_sec == 0 ||
@@ -1236,7 +1242,7 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
     gpu_info->last_nvlink_cli_time = current_time;
   }
 
-  // Aggregate throughput: EMA smoothing (alpha = 0.3) on current value
+  // Aggregate throughput output
   if (gpu_info->cli_poll_active) {
     nvlink_info->has_throughput = true;
     nvlink_info->aggregate_tx = gpu_info->smoothed_agg_tx;

From fc9e548c8f7b865d4aabc72f55bc5c73a2d06965 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Fri, 1 May 2026 22:39:39 -0400
Subject: [PATCH 09/31] revert: remove EMA smoothing from NVLink throughput,
 prioritize raw accuracy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Document that the nvidia-smi CLI throughput fallback path applies no
Exponential Moving Average smoothing: the raw delta/time_delta rate is used
directly, because accuracy is more important than display smoothness for a
monitoring tool. This is a comment-only change; no code behavior is altered.
---
 src/extract_gpuinfo_nvidia.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index e8b4c3a1..c2aea806 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -1232,6 +1232,7 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
           total_tx += cli_tx[link] - gpu_info->nvlink_cli_tx[link];
           total_rx += cli_rx[link] - gpu_info->nvlink_cli_rx[link];
         }
+        // Raw rate (no smoothing — accuracy is more important than display smoothness)
         gpu_info->smoothed_agg_tx = (unsigned long long)((double)total_tx / delta_s);
         gpu_info->smoothed_agg_rx = (unsigned long long)((double)total_rx / delta_s);
       }
@@ -1242,7 +1243,7 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
     gpu_info->last_nvlink_cli_time = current_time;
   }
 
-  // Aggregate throughput output
+  // Aggregate throughput: raw rate (no smoothing — accuracy over display smoothness)
   if (gpu_info->cli_poll_active) {
     nvlink_info->has_throughput = true;
     nvlink_info->aggregate_tx = gpu_info->smoothed_agg_tx;

From 4b1f9f2ebe8c233cfc3cfcd439386cbcbe026cc3 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sat, 2 May 2026 00:34:48 -0400
Subject: [PATCH 10/31] feat: increase NVLink max links from 18 to 36, hardcode
 2s CLI poll interval

- NVTOP_NVLINK_MAX_LINKS and NVML_NVLINK_MAX_LINKS_INTERNAL increased from 18 to 36
  for future-proof support of devices with up to 36 NVLink links.
- Add an explicit comment: the 2-second nvidia-smi CLI poll interval is hardcoded
  and independent of the global refresh rate, which limits how often this
  resource-heavy process (full binary fork + text parsing) is spawned.
- Update code comments with expanded field ID range (32-247 for links 0-35).
---
 include/nvtop/extract_gpuinfo_common.h |  2 +-
 src/extract_gpuinfo_nvidia.c           | 14 +++++++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/include/nvtop/extract_gpuinfo_common.h b/include/nvtop/extract_gpuinfo_common.h
index f1019efb..aafa4899 100644
--- a/include/nvtop/extract_gpuinfo_common.h
+++ b/include/nvtop/extract_gpuinfo_common.h
@@ -241,7 +241,7 @@ inline unsigned busy_usage_from_time_usage_round(uint64_t current_use_ns, uint64
 unsigned nvtop_pcie_gen_from_link_speed(unsigned linkSpeed);
 
 // NVLink support
-#define NVTOP_NVLINK_MAX_LINKS 18
+#define NVTOP_NVLINK_MAX_LINKS 36
 
 struct nvlink_info {
   unsigned num_links;                 // Number of NVLink links on this device
diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index c2aea806..32b49507 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -991,7 +991,7 @@ static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info) {
 }
 
 // NVML NVLink enums (defined locally since we don't have nvml.h)
-#define NVML_NVLINK_MAX_LINKS_INTERNAL 18
+#define NVML_NVLINK_MAX_LINKS_INTERNAL 36
 
 // NVML error counter types
 #define NVML_NVLINK_ERROR_DL_REPLAY   0
@@ -1021,7 +1021,7 @@ struct gpu_info_nvidia;
 // Uses baseline subtraction to show only errors/corrections since nvtop launch (Option B).
 // Called from refresh_dynamic_info so it does NOT run during the startup probe in nvtop_probe_nvlink_list.
 // Phase 1: nvmlDeviceGetNvLinkErrorCounter for replay, recovery, CRC errors per link.
-// Phase 2: nvmlDeviceGetFieldValues for per-lane CRC flit corrections (field IDs 32-49 for links 0-17).
+// Phase 2: nvmlDeviceGetFieldValues for per-lane CRC flit corrections (field IDs 32-247 for links 0-35).
 static void nvlink_read_errors(nvmlDevice_t device, unsigned int linkCount, struct gpu_info_nvidia *gpu_info) {
   // Phase 1: error counters via nvmlDeviceGetNvLinkErrorCounter
   unsigned long long cumulative_errors = 0;
@@ -1045,7 +1045,7 @@ static void nvlink_read_errors(nvmlDevice_t device, unsigned int linkCount, stru
   }
 
   // Phase 2: per-lane CRC corrections via nvmlDeviceGetFieldValues
-  // Field IDs: link 0 = 32-37, link 1 = 38-43, link 2 = 44-49, etc. (6 field IDs per link for lanes 0-5)
+  // Field IDs: link 0 = 32-37, link 1 = 38-43, link 2 = 44-49, etc. (6 field IDs per link for lanes 0-5, up to link 35 = 242-247)
   // The caller must populate fieldId in each nvmlFieldValue_t entry BEFORE calling;
   // the library populates the value fields on return.
   // nvmlFieldValue_t is 48 bytes: fieldId:u32(0), scopeId:u32(4), timestamp:u64(8),
@@ -1212,6 +1212,14 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
   // direct API call (zero process overhead). Keep this nvidia-smi CLI code
   // as a conditional fallback for consumer GPUs (RTX 3090, 3080 Ti) where
   // the NVML utilization counter is not exposed.
+  // Hardcoded 2-second CLI poll interval — independent of global nvtop refresh rate.
+  // nvidia-smi is a resource-heavy process (full binary fork + text parsing). This
+  // throttles the expensive popen/pclose calls to a maximum of one per 2 seconds,
+  // minimizing resource usage regardless of how fast the user sets the display refresh.
+  // A faster global refresh (e.g. 0.5s) would otherwise fork nvidia-smi far too often,
+  // degrading overall system performance. The delta-based rate computation
+  // (total_bytes / delta_s) normalizes to a per-second value, so the displayed
+  // throughput remains accurate even with a 2-second sample window.
   nvtop_time current_time;
   nvtop_get_current_time(&current_time);
   if (gpu_info->last_nvlink_cli_time.tv_sec == 0 ||

From 61a31dc7dc4682ac33d543ac27c06edf7f569a2d Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sat, 2 May 2026 07:52:24 -0400
Subject: [PATCH 11/31] fix: add missing delwin for
 shader_cores/l2_cache_size/exec_engines, plot_window, and unsigned underflow
 guard

- free_device_windows: delwin() for shader_cores, l2_cache_size,
  exec_engines (upstream PR #467 fix/memory-leaks-in-free-device-windows)
- delete_all_windows: delwin() for plots[i].plot_window
  (upstream PR #468 fix/plot-window-memory-leak)
- nvtop_get_nvlink_info: guard against unsigned underflow in CLI
  throughput delta if hardware counter wraps or resets
---
 src/extract_gpuinfo_nvidia.c |  9 +++++++--
 src/interface.c              | 12 ++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index 32b49507..ef65ad24 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -1237,8 +1237,13 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
 
         unsigned long long total_tx = 0, total_rx = 0;
         for (unsigned int link = 0; link < linkCount; link++) {
-          total_tx += cli_tx[link] - gpu_info->nvlink_cli_tx[link];
-          total_rx += cli_rx[link] - gpu_info->nvlink_cli_rx[link];
+          // Guard against unsigned underflow if the hardware counter wraps or resets.
+          // If the new reading is less than the stored reading, skip this link to
+          // avoid a delta near ULLONG_MAX that would produce an absurd throughput spike.
+          if (cli_tx[link] >= gpu_info->nvlink_cli_tx[link])
+            total_tx += cli_tx[link] - gpu_info->nvlink_cli_tx[link];
+          if (cli_rx[link] >= gpu_info->nvlink_cli_rx[link])
+            total_rx += cli_rx[link] - gpu_info->nvlink_cli_rx[link];
         }
         // Raw rate (no smoothing — accuracy is more important than display smoothness)
         gpu_info->smoothed_agg_tx = (unsigned long long)((double)total_tx / delta_s);
diff --git a/src/interface.c b/src/interface.c
index be175e2c..1dee6f31 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -252,6 +252,13 @@ static void free_device_windows(struct device_window *dwin) {
   delwin(dwin->temperature);
   delwin(dwin->fan_speed);
   delwin(dwin->pcie_info);
+  // Upstream bug: shader_cores, l2_cache_size, exec_engines were allocated with
+  // newwin() in alloc_device_window() but never freed here. This leaks 3 WINDOW
+  // structs per device on every delete_all_windows() call (terminal resize, GPU set change).
+  // Also fixed in upstream PR #467 (fix/memory-leaks-in-free-device-windows).
+  delwin(dwin->shader_cores);
+  delwin(dwin->l2_cache_size);
+  delwin(dwin->exec_engines);
   if (dwin->nvlink_info != NULL)
     delwin(dwin->nvlink_info);
   if (dwin->nvlink_errors != NULL)
@@ -459,6 +466,11 @@ static void delete_all_windows(struct nvtop_interface *dwin) {
   delwin(dwin->process.option_window.option_win);
   for (size_t i = 0; i < dwin->num_plots; ++i) {
     delwin(dwin->plots[i].win);
+    // Upstream bug: plot_window was allocated with newwin() in
+    // initialize_gpu_mem_plot() but never freed here. This leaks one WINDOW
+    // struct per chart on every delete_all_windows() call.
+    // Also fixed in upstream PR #468 (fix/plot-window-memory-leak).
+    delwin(dwin->plots[i].plot_window);
     free(dwin->plots[i].data);
   }
   free_setup_window(&dwin->setup_win);

From baa21a2ba21eaac3bc1b783b539e3623e9635bc3 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sat, 2 May 2026 08:10:28 -0400
Subject: [PATCH 12/31] fix: priority-2 code quality improvements

- print_data_at_scale: change parameter from unsigned int to unsigned long long
  to prevent 32-bit truncation on high-throughput NVLink hardware (e.g. B100/GB200)
- Remove duplicate #include <stdio.h> / #include <string.h> mid-file
  (already included at the top of extract_gpuinfo_nvidia.c)
- Remove redundant forward declaration of struct gpu_info_nvidia
  (struct already fully defined earlier in the same file)
- Remove unused NVM_LVALUE_VALUE_TYPE_OFF macro
---
 src/extract_gpuinfo_nvidia.c | 7 -------
 src/interface.c              | 8 +++++---
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index ef65ad24..b63976eb 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -1007,16 +1007,9 @@ static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info) {
 // NOTE: nvmlFieldValue_t layout varies across NVML/driver versions.
 // Always verify against the header shipped with the driver you're targeting.
 #define NVM_LVALUE_FIELD_ID_OFF     0
-#define NVM_LVALUE_VALUE_TYPE_OFF   4
 #define NVM_LVALUE_UINT64_OFF       32
 #define NVM_LVALUE_SIZE             48
 
-#include <stdio.h>
-#include <string.h>
-
-// Forward declaration
-struct gpu_info_nvidia;
-
 // Read NVLink error counters and CRC corrections, storing results in the persistent gpu_info struct.
 // Uses baseline subtraction to show only errors/corrections since nvtop launch (Option B).
 // Called from refresh_dynamic_info so it does NOT run during the startup probe in nvtop_probe_nvlink_list.
diff --git a/src/interface.c b/src/interface.c
index 1dee6f31..ff309f52 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -627,7 +627,7 @@ static void draw_temp_color(WINDOW *win, unsigned int temp, unsigned int temp_sl
   wnoutrefresh(win);
 }
 
-static void print_data_at_scale(WINDOW *win, unsigned int value) {
+static void print_data_at_scale(WINDOW *win, unsigned long long value) {
   int prefix_off;
   double val_d = value;
   for (prefix_off = 1; prefix_off < 6 && val_d >= 1000.; ++prefix_off) {
@@ -645,7 +645,9 @@ static void print_data_at_scale(WINDOW *win, unsigned int value) {
   wprintw(win, " %sB/s", memory_prefix[prefix_off]);
 }
 
-// Renamed from print_pcie_at_scale -> print_data_at_scale: reused for NVLink throughput (identical scale logic, bounds check extended to support TiB/s)
+// print_data_at_scale (renamed from print_pcie_at_scale): reused for NVLink throughput
+// (identical scale logic, bounds check extended to prefix_off < 6 for TiB/s).
+// Takes unsigned long long to avoid 32-bit truncation on high-throughput hardware.
 
 static inline void werase_and_wnoutrefresh(WINDOW *w) {
   werase(w);
@@ -944,7 +946,7 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
           wprintw(dev->nvlink_info, "%ux ", nvl_info.num_links);
 
         if (nvl_info.has_throughput) {
-          unsigned total_kib = (unsigned)(nvl_info.aggregate_tx + nvl_info.aggregate_rx);
+          unsigned long long total_kib = nvl_info.aggregate_tx + nvl_info.aggregate_rx;
           print_data_at_scale(dev->nvlink_info, total_kib);
         }
       }

From 737d4f08756776457376ce7f91cd625613cdbbf8 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sat, 2 May 2026 08:23:03 -0400
Subject: [PATCH 13/31] feat: cache NVLink link count and version to avoid
 re-probing every refresh cycle

- Add nvlink_cached_linkcount and nvlink_cached_version to
  struct gpu_info_nvidia (static hardware properties, probe once)
- Add nvlink_probe_and_cache() helper that probes all links and
  caches results on first call, returns cached value thereafter
- Replace inline probe loop in refresh_dynamic_info() with
  nvlink_probe_and_cache() call (also fixes hardcoded limit of 18
  -> now uses full NVML_NVLINK_MAX_LINKS_INTERNAL of 36)
- Replace inline probe loop in nvtop_get_nvlink_info() with
  nvlink_probe_and_cache() call, reads version from cache
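- Eliminates up to 36 NVML API calls per GPU per refresh cycle once at least
  one link has been detected (a cached link count of 0 means "not yet probed",
  so GPUs without NVLink are still re-probed on every call)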
---
 src/extract_gpuinfo_nvidia.c | 84 +++++++++++++++++++++++-------------
 1 file changed, 53 insertions(+), 31 deletions(-)

diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index b63976eb..7fdf63a0 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -301,6 +301,10 @@ struct gpu_info_nvidia {
   // Display-ready error/correction counts (computed in refresh_dynamic_info)
   unsigned long long display_errors; // Errors since nvtop launch
   unsigned long long display_corrections; // Corrections since nvtop launch
+
+  // Cached NVLink hardware properties (probe once, reuse forever)
+  unsigned int nvlink_cached_linkcount; // 0 = not yet probed
+  unsigned int nvlink_cached_version;   // Marketing version, 0 = not yet probed
 };
 
 static LIST_HEAD(allocations);
@@ -316,6 +320,13 @@ static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info);
 // Forward declaration for nvlink_read_errors (defined later, called from refresh_dynamic_info)
 static void nvlink_read_errors(struct nvmlDevice *device, unsigned int linkCount, struct gpu_info_nvidia *gpu_info);
 
+// Remap raw NVML NVLink protocol version to the marketing version (forward declaration)
+static unsigned int nvlink_marketing_version(unsigned int raw_version);
+
+// Probe NVLink link count and version, caching results in gpu_info_nvidia to avoid
+// repeated NVML API calls on every refresh cycle. Returns cached linkCount (0 if no NVLink).
+unsigned nvlink_probe_and_cache(struct gpu_info_nvidia *gpu_info);
+
 struct gpu_vendor gpu_vendor_nvidia = {
     .init = gpuinfo_nvidia_init,
     .shutdown = gpuinfo_nvidia_shutdown,
@@ -788,17 +799,7 @@ static void gpuinfo_nvidia_refresh_dynamic_info(struct gpu_info *_gpu_info) {
   // NVLink error counters (called here, not in nvtop_get_nvlink_info, to avoid the startup probe
   // establishing the baseline early and causing non-zero counters on first display refresh)
   if (nvmlDeviceGetNvLinkErrorCounter || nvmlDeviceGetFieldValues) {
-    unsigned int linkCount = 0;
-    if (nvmlDeviceGetNvLinkState) {
-      for (unsigned int link = 0; link < 18; link++) {
-        unsigned int isActive = 0;
-        nvmlReturn_t ret = nvmlDeviceGetNvLinkState(device, link, &isActive);
-        if (ret == NVML_SUCCESS)
-          linkCount = link + 1;
-        else if (ret != NVML_ERROR_NOT_SUPPORTED)
-          break;
-      }
-    }
+    unsigned int linkCount = nvlink_probe_and_cache(gpu_info);
     if (linkCount > 0)
       nvlink_read_errors(device, linkCount, gpu_info);
   }
@@ -1010,6 +1011,44 @@ static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info) {
 #define NVM_LVALUE_UINT64_OFF       32
 #define NVM_LVALUE_SIZE             48
 
+// Probe NVLink link count and version, caching results in gpu_info_nvidia to avoid
+// repeated NVML API calls on every refresh cycle. linkCount and version are static
+// hardware properties — once discovered, they never change during the process lifetime.
+// Returns the cached linkCount (0 if no NVLink).
+unsigned nvlink_probe_and_cache(struct gpu_info_nvidia *gpu_info) {
+  // Already cached — return immediately
+  if (gpu_info->nvlink_cached_linkcount > 0)
+    return gpu_info->nvlink_cached_linkcount;
+
+  if (!nvmlDeviceGetNvLinkState)
+    return 0;
+
+  nvmlDevice_t device = gpu_info->gpuhandle;
+  unsigned int linkCount = 0;
+  unsigned int version = 0;
+  for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS_INTERNAL; link++) {
+    unsigned int isActive = 0;
+    nvmlReturn_t ret = nvmlDeviceGetNvLinkState(device, link, &isActive);
+    if (ret == NVML_SUCCESS || ret == NVML_ERROR_NOT_SUPPORTED) {
+      if (ret == NVML_SUCCESS) {
+        linkCount = link + 1;
+        // Read version on first link only (all links share the same version)
+        if (link == 0 && nvmlDeviceGetNvLinkVersion) {
+          nvmlReturn_t vret = nvmlDeviceGetNvLinkVersion(device, 0, &version);
+          if (vret == NVML_SUCCESS)
+            version = nvlink_marketing_version(version);
+        }
+      }
+    } else {
+      break;
+    }
+  }
+  // Cache results
+  gpu_info->nvlink_cached_linkcount = linkCount;
+  gpu_info->nvlink_cached_version = version;
+  return linkCount;
+}
+
 // Read NVLink error counters and CRC corrections, storing results in the persistent gpu_info struct.
 // Uses baseline subtraction to show only errors/corrections since nvtop launch (Option B).
 // Called from refresh_dynamic_info so it does NOT run during the startup probe in nvtop_probe_nvlink_list.
@@ -1169,26 +1208,9 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
   if (!nvmlDeviceGetNvLinkState)
     return 0;
 
-  // Discover link count by probing each possible link
-  unsigned int linkCount = 0;
-  unsigned int version = 0;
-  for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS_INTERNAL; link++) {
-    unsigned int isActive = 0;
-    nvmlReturn_t ret = nvmlDeviceGetNvLinkState(device, link, &isActive);
-    if (ret == NVML_SUCCESS || ret == NVML_ERROR_NOT_SUPPORTED) {
-      if (ret == NVML_SUCCESS) {
-        linkCount = link + 1;
-        // Read version on first link only (all links share the same version)
-        if (link == 0 && nvmlDeviceGetNvLinkVersion) {
-          nvmlReturn_t vret = nvmlDeviceGetNvLinkVersion(device, 0, &version);
-          if (vret == NVML_SUCCESS)
-            version = nvlink_marketing_version(version);
-        }
-      }
-    } else {
-      break;
-    }
-  }
+  // Use cached link count and version (probe once, reuse forever)
+  unsigned int linkCount = nvlink_probe_and_cache(gpu_info);
+  unsigned int version = gpu_info->nvlink_cached_version;
 
   if (linkCount == 0)
     return 0;

From 6f1183da003adc04ff64e8922582e2767b13dbd1 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sat, 2 May 2026 08:30:15 -0400
Subject: [PATCH 14/31] perf: skip NVLink re-probing when already detected,
 with monitored-set-change reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add early return in nvtop_probe_nvlink_list() when
  any_device_has_nvlink is already true — NVLink support is a
  static hardware property, no need to re-probe every refresh cycle
- Reset any_device_has_nvlink in
  interface_check_monitored_gpu_change() when monitored set changes,
  so the user can switch between NVLink and non-NVLink GPUs without
  the cache becoming stale
---
 src/interface.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/interface.c b/src/interface.c
index ff309f52..d745f6d1 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -60,6 +60,11 @@ static void nvtop_adjust_field_sizes_for_nvlink(void) {
 }
 
 bool nvtop_probe_nvlink_list(struct list_head *devices) {
+  // Skip re-probing if we already know at least one device has NVLink.
+  // NVLink support is a static hardware property that does not change at runtime.
+  if (any_device_has_nvlink)
+    return true;
+
   struct gpu_info *gpu;
   list_for_each_entry(gpu, devices, list) {
     struct nvlink_info nvl;
@@ -2203,6 +2208,10 @@ void interface_check_monitored_gpu_change(struct nvtop_interface **interface, un
     nvtop_interface_option options_copy = (*interface)->options;
     options_copy.has_monitored_set_changed = false;
     memset(&(*interface)->options, 0, sizeof(options_copy));
+    // Reset NVLink probe cache when monitored device set changes — the user
+    // may have switched from an NVLink GPU to a non-NVLink one (or vice versa).
+    // The cache will be repopulated on the next refresh cycle.
+    any_device_has_nvlink = false;
     *num_monitored_gpus =
         interface_check_and_fix_monitored_gpus(allDevCount, monitoredGpus, nonMonitoredGpus, &options_copy);
     clean_ncurses(*interface);

From 1d11cda17f695f7152914e98fa255dd415708bf2 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sat, 2 May 2026 09:04:48 -0400
Subject: [PATCH 15/31] perf: cache full nvlink_info struct in refresh path,
 optimize draw path for non-hot-swap hardware

---
 include/nvtop/extract_gpuinfo_common.h |   4 +
 src/extract_gpuinfo_nvidia.c           | 133 ++++++++++++++++++-------
 src/interface.c                        |   5 +
 3 files changed, 105 insertions(+), 37 deletions(-)

diff --git a/include/nvtop/extract_gpuinfo_common.h b/include/nvtop/extract_gpuinfo_common.h
index aafa4899..bd1fe606 100644
--- a/include/nvtop/extract_gpuinfo_common.h
+++ b/include/nvtop/extract_gpuinfo_common.h
@@ -266,4 +266,8 @@ bool nvtop_get_nvlink_error_counts(struct gpu_info *gpu_info,
 bool nvtop_probe_nvlink_list(struct list_head *devices);
 void nvtop_set_nvlink_probe(bool val);
 
+// Reset per-GPU NVLink cache (probed flag, cached linkcount/version, cached info struct).
+// Call when the monitored device set changes so newly-monitored NVLink GPUs get probed fresh.
+void nvtop_reset_nvlink_cache(struct gpu_info *gpu_info);
+
 #endif // EXTRACT_GPUINFO_COMMON_H__
diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index 7fdf63a0..7c5ad927 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -303,8 +303,15 @@ struct gpu_info_nvidia {
   unsigned long long display_corrections; // Corrections since nvtop launch
 
   // Cached NVLink hardware properties (probe once, reuse forever)
-  unsigned int nvlink_cached_linkcount; // 0 = not yet probed
+  bool nvlink_probed; // true after first probe, regardless of result
+  unsigned int nvlink_cached_linkcount; // 0 = no NVLink links
   unsigned int nvlink_cached_version;   // Marketing version, 0 = not yet probed
+
+  // Cached nvlink_info struct: populated during refresh_dynamic_info,
+  // returned by nvtop_get_nvlink_info in the draw path to avoid redundant
+  // NVML calls and CLI forks on every draw cycle.
+  struct nvlink_info cached_nvlink_info;
+  bool cached_nvlink_info_populated;
 };
 
 static LIST_HEAD(allocations);
@@ -320,6 +327,10 @@ static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info);
 // Forward declaration for nvlink_read_errors (defined later, called from refresh_dynamic_info)
 static void nvlink_read_errors(struct nvmlDevice *device, unsigned int linkCount, struct gpu_info_nvidia *gpu_info);
 
+// Forward declaration for nvlink_refresh_cached_info (defined later, called from refresh_dynamic_info)
+// Populates gpu_info->cached_nvlink_info with throughput + error data.
+static void nvlink_refresh_cached_info(struct gpu_info_nvidia *gpu_info, unsigned int linkCount);
+
 // Remap raw NVML NVLink protocol version to the marketing version (forward declaration)
 static unsigned int nvlink_marketing_version(unsigned int raw_version);
 
@@ -796,12 +807,19 @@ static void gpuinfo_nvidia_refresh_dynamic_info(struct gpu_info *_gpu_info) {
     }
   }
 
-  // NVLink error counters (called here, not in nvtop_get_nvlink_info, to avoid the startup probe
-  // establishing the baseline early and causing non-zero counters on first display refresh)
-  if (nvmlDeviceGetNvLinkErrorCounter || nvmlDeviceGetFieldValues) {
+  // NVLink: refresh error counters, throughput, and populate cached nvlink_info
+  // GPUs are non-hot-swappable — all NVLink probing/computation happens here
+  // (refresh path), and nvtop_get_nvlink_info() just returns the cached copy
+  // in the draw path.
+  if (nvmlDeviceGetNvLinkState) {
     unsigned int linkCount = nvlink_probe_and_cache(gpu_info);
-    if (linkCount > 0)
-      nvlink_read_errors(device, linkCount, gpu_info);
+    if (linkCount > 0) {
+      // Error counters (dynamic — must refresh every cycle)
+      if (nvmlDeviceGetNvLinkErrorCounter || nvmlDeviceGetFieldValues)
+        nvlink_read_errors(device, linkCount, gpu_info);
+      // Throughput + cached info struct
+      nvlink_refresh_cached_info(gpu_info, linkCount);
+    }
   }
 }
 
@@ -1016,12 +1034,14 @@ static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info) {
 // hardware properties — once discovered, they never change during the process lifetime.
 // Returns the cached linkCount (0 if no NVLink).
 unsigned nvlink_probe_and_cache(struct gpu_info_nvidia *gpu_info) {
-  // Already cached — return immediately
-  if (gpu_info->nvlink_cached_linkcount > 0)
+  // Already probed — return cached result (even if linkcount is 0)
+  if (gpu_info->nvlink_probed)
     return gpu_info->nvlink_cached_linkcount;
 
-  if (!nvmlDeviceGetNvLinkState)
+  if (!nvmlDeviceGetNvLinkState) {
+    gpu_info->nvlink_probed = true;
     return 0;
+  }
 
   nvmlDevice_t device = gpu_info->gpuhandle;
   unsigned int linkCount = 0;
@@ -1044,6 +1064,7 @@ unsigned nvlink_probe_and_cache(struct gpu_info_nvidia *gpu_info) {
     }
   }
   // Cache results
+  gpu_info->nvlink_probed = true;
   gpu_info->nvlink_cached_linkcount = linkCount;
   gpu_info->nvlink_cached_version = version;
   return linkCount;
@@ -1196,28 +1217,16 @@ static unsigned int nvlink_marketing_version(unsigned int raw_version) {
 
 // Get NVLink info (version, link count, aggregate throughput via CLI).
 // Designed for consumer GPUs (RTX 3090) where NVML utilization counters are unavailable.
-unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *nvlink_info) {
-  if (!_gpu_info || !nvlink_info)
-    return 0;
-
-  struct gpu_info_nvidia *gpu_info = container_of(_gpu_info, struct gpu_info_nvidia, base);
-  nvmlDevice_t device = gpu_info->gpuhandle;
+// Populate cached_nvlink_info with link count, version, throughput, and error counts.
+// Called from refresh_dynamic_info on every refresh cycle (refresh path).
+// GPUs are non-hot-swappable, so all NVLink data is computed here and cached —
+// nvtop_get_nvlink_info() in the draw path just returns the cached copy.
+static void nvlink_refresh_cached_info(struct gpu_info_nvidia *gpu_info, unsigned int linkCount) {
+  struct nvlink_info *cache = &gpu_info->cached_nvlink_info;
 
-  memset(nvlink_info, 0, sizeof(*nvlink_info));
-
-  if (!nvmlDeviceGetNvLinkState)
-    return 0;
-
-  // Use cached link count and version (probe once, reuse forever)
-  unsigned int linkCount = nvlink_probe_and_cache(gpu_info);
-  unsigned int version = gpu_info->nvlink_cached_version;
-
-  if (linkCount == 0)
-    return 0;
-
-  nvlink_info->supported = true;
-  nvlink_info->num_links = linkCount;
-  nvlink_info->version = version;
+  cache->supported = true;
+  cache->num_links = linkCount;
+  cache->version = gpu_info->nvlink_cached_version;
 
   // Throughput via nvidia-smi CLI (NVML utilization counters unavailable on consumer GPUs).
   // Poll every 2 seconds to keep CPU overhead low.
@@ -1271,16 +1280,66 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
     gpu_info->last_nvlink_cli_time = current_time;
   }
 
-  // Aggregate throughput: raw rate (no smoothing — accuracy over display smoothness)
+  // Aggregate throughput
   if (gpu_info->cli_poll_active) {
-    nvlink_info->has_throughput = true;
-    nvlink_info->aggregate_tx = gpu_info->smoothed_agg_tx;
-    nvlink_info->aggregate_rx = gpu_info->smoothed_agg_rx;
+    cache->has_throughput = true;
+    cache->aggregate_tx = gpu_info->smoothed_agg_tx;
+    cache->aggregate_rx = gpu_info->smoothed_agg_rx;
+  } else {
+    cache->has_throughput = false;
+    cache->aggregate_tx = 0;
+    cache->aggregate_rx = 0;
   }
 
-  // Error counters are read separately via nvlink_read_errors() called from
-  // the display loop (draw_devices). Do NOT call here to avoid the startup
-  // probe (nvtop_probe_nvlink_list) from establishing the baseline early.
+  // Error/correction counts from display-ready fields (populated by nvlink_read_errors)
+  cache->total_errors = gpu_info->display_errors;
+  cache->total_corrections = gpu_info->display_corrections;
+
+  gpu_info->cached_nvlink_info_populated = true;
+}
+
+// Return cached nvlink_info struct. Called from the draw path (draw_gpu_info_ncurses)
+// to avoid redundant NVML calls and CLI forks on every draw cycle.
+// GPUs are non-hot-swappable, so the cached struct is authoritative.
+// For the startup probe (nvtop_probe_nvlink_list) before refresh_dynamic_info has run,
+// falls back to computing on-demand.
+unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *nvlink_info) {
+  if (!_gpu_info || !nvlink_info)
+    return 0;
+
+  struct gpu_info_nvidia *gpu_info = container_of(_gpu_info, struct gpu_info_nvidia, base);
+
+  // If cached info is available (after first refresh), just return it.
+  // This is the fast path — eliminates all NVML calls and CLI forks in the draw path.
+  if (gpu_info->cached_nvlink_info_populated) {
+    memcpy(nvlink_info, &gpu_info->cached_nvlink_info, sizeof(*nvlink_info));
+    return nvlink_info->num_links;
+  }
+
+  // Fallback for startup probe (nvtop_probe_nvlink_list) before refresh_dynamic_info ran:
+  // Populate minimal info (link count + version, no throughput) to determine if NVLink exists.
+  if (!nvmlDeviceGetNvLinkState)
+    return 0;
+
+  memset(nvlink_info, 0, sizeof(*nvlink_info));
+
+  unsigned int linkCount = nvlink_probe_and_cache(gpu_info);
+  if (linkCount == 0)
+    return 0;
+
+  nvlink_info->supported = true;
+  nvlink_info->num_links = linkCount;
+  nvlink_info->version = gpu_info->nvlink_cached_version;
 
   return nvlink_info->num_links;
 }
+
+// Reset all NVLink caches for a single GPU. Called when monitored device set changes.
+void nvtop_reset_nvlink_cache(struct gpu_info *_gpu_info) {
+  struct gpu_info_nvidia *gpu_info = container_of(_gpu_info, struct gpu_info_nvidia, base);
+  gpu_info->nvlink_probed = false;
+  gpu_info->nvlink_cached_linkcount = 0;
+  gpu_info->nvlink_cached_version = 0;
+  gpu_info->cached_nvlink_info_populated = false;
+  memset(&gpu_info->cached_nvlink_info, 0, sizeof(gpu_info->cached_nvlink_info));
+}
diff --git a/src/interface.c b/src/interface.c
index d745f6d1..769d0093 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -2212,6 +2212,11 @@ void interface_check_monitored_gpu_change(struct nvtop_interface **interface, un
     // may have switched from an NVLink GPU to a non-NVLink one (or vice versa).
     // The cache will be repopulated on the next refresh cycle.
     any_device_has_nvlink = false;
+    // Reset NVLink probes on all monitored GPUs so they get probed fresh.
+    { struct gpu_info *g;
+      list_for_each_entry(g, monitoredGpus, list)
+        nvtop_reset_nvlink_cache(g);
+    }
     *num_monitored_gpus =
         interface_check_and_fix_monitored_gpus(allDevCount, monitoredGpus, nonMonitoredGpus, &options_copy);
     clean_ncurses(*interface);

From b480c24bec6c1cb3bc6c11aaa6740a15a1257aef Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sat, 2 May 2026 11:37:05 -0400
Subject: [PATCH 16/31] feat: add NVLink 6.0 (Rubin) version mapping for
 future-proofing

Add `case 8: return 6;` to nvlink_marketing_version() to handle the
raw NVML version value reported for NVLink 6.0 on the NVIDIA Rubin
platform. Also add descriptive comments to the existing NVLink 4.0
and 5.0 mapping cases.
---
 src/extract_gpuinfo_nvidia.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index 7c5ad927..b1f1544e 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -1209,8 +1209,9 @@ static unsigned int nvlink_marketing_version(unsigned int raw_version) {
     case 3: return 2;  // NVLink 2.2 -> 2
     case 4: return 3;  // NVLink 3.0 -> 3
     case 5: return 3;  // NVLink 3.1 -> 3
-    case 6: return 4;
-    case 7: return 5;
+    case 6: return 4;  // NVLink 4.0
+    case 7: return 5;  // NVLink 5.0
+    case 8: return 6;  // NVLink 6.0 (Rubin)
     default: return raw_version;
   }
 }

From 007c4eeea2766cb8ff87eb4221b6144b1d8cbf9d Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sun, 3 May 2026 10:46:54 -0400
Subject: [PATCH 17/31] feat: detect and display NVLink-supported GPUs with 0
 active links

Probe NVLink version before link state loop so "supported but no
bridge" is detected. Display shows "NVL3 0x" for GPUs with NVLink
hardware but no bridge connected. Layout compaction only applies
when active links are present (0-link display needs no padding
reduction). Adds any_device_has_nvlink_active flag to distinguish
NVLink hardware support from active connections.
---
 src/extract_gpuinfo_nvidia.c | 55 ++++++++++++++++++---------
 src/interface.c              | 72 ++++++++++++++++++++++++++----------
 2 files changed, 90 insertions(+), 37 deletions(-)

diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index b1f1544e..9f37455e 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -811,13 +811,15 @@ static void gpuinfo_nvidia_refresh_dynamic_info(struct gpu_info *_gpu_info) {
   // GPUs are non-hot-swappable — all NVLink probing/computation happens here
   // (refresh path), and nvtop_get_nvlink_info() just returns the cached copy
   // in the draw path.
+  // "supported but no bridge" case: version is probed before link state, so
+  // cached_version > 0 means NVLink hardware exists even with linkCount == 0.
   if (nvmlDeviceGetNvLinkState) {
     unsigned int linkCount = nvlink_probe_and_cache(gpu_info);
-    if (linkCount > 0) {
-      // Error counters (dynamic — must refresh every cycle)
-      if (nvmlDeviceGetNvLinkErrorCounter || nvmlDeviceGetFieldValues)
+    if (linkCount > 0 || gpu_info->nvlink_cached_version > 0) {
+      // Error counters only make sense when links are active.
+      if (linkCount > 0 && (nvmlDeviceGetNvLinkErrorCounter || nvmlDeviceGetFieldValues))
         nvlink_read_errors(device, linkCount, gpu_info);
-      // Throughput + cached info struct
+      // Throughput + cached info struct (handles 0-link case for display)
       nvlink_refresh_cached_info(gpu_info, linkCount);
     }
   }
@@ -1046,19 +1048,24 @@ unsigned nvlink_probe_and_cache(struct gpu_info_nvidia *gpu_info) {
   nvmlDevice_t device = gpu_info->gpuhandle;
   unsigned int linkCount = 0;
   unsigned int version = 0;
+
+  // Probe NVLink version BEFORE the link state loop. This succeeds on any GPU
+  // with NVLink hardware, even when no bridge is connected (all links return
+  // NVML_ERROR_NOT_SUPPORTED from GetNvLinkState). This lets us detect
+  // "NVLink supported but no active links" vs "no NVLink hardware at all."
+  if (nvmlDeviceGetNvLinkVersion) {
+    nvmlReturn_t vret = nvmlDeviceGetNvLinkVersion(device, 0, &version);
+    if (vret == NVML_SUCCESS)
+      version = nvlink_marketing_version(version);
+  }
+
+  // Probe active links
   for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS_INTERNAL; link++) {
     unsigned int isActive = 0;
     nvmlReturn_t ret = nvmlDeviceGetNvLinkState(device, link, &isActive);
     if (ret == NVML_SUCCESS || ret == NVML_ERROR_NOT_SUPPORTED) {
-      if (ret == NVML_SUCCESS) {
+      if (ret == NVML_SUCCESS)
         linkCount = link + 1;
-        // Read version on first link only (all links share the same version)
-        if (link == 0 && nvmlDeviceGetNvLinkVersion) {
-          nvmlReturn_t vret = nvmlDeviceGetNvLinkVersion(device, 0, &version);
-          if (vret == NVML_SUCCESS)
-            version = nvlink_marketing_version(version);
-        }
-      }
     } else {
       break;
     }
@@ -1229,6 +1236,17 @@ static void nvlink_refresh_cached_info(struct gpu_info_nvidia *gpu_info, unsigne
   cache->num_links = linkCount;
   cache->version = gpu_info->nvlink_cached_version;
 
+  // Throughput: skip entirely when there are 0 links (nothing to measure).
+  if (linkCount == 0) {
+    cache->has_throughput = false;
+    cache->aggregate_tx = 0;
+    cache->aggregate_rx = 0;
+    cache->total_errors = 0;
+    cache->total_corrections = 0;
+    gpu_info->cached_nvlink_info_populated = true;
+    return;
+  }
+
   // Throughput via nvidia-smi CLI (NVML utilization counters unavailable on consumer GPUs).
   // Poll every 2 seconds to keep CPU overhead low.
   //
@@ -1319,18 +1337,21 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
 
   // Fallback for startup probe (nvtop_probe_nvlink_list) before refresh_dynamic_info ran:
   // Populate minimal info (link count + version, no throughput) to determine if NVLink exists.
+  // "supported but no bridge" case: version probed before link state, so set supported=true
+  // even when linkCount == 0 if we got a version reading.
   if (!nvmlDeviceGetNvLinkState)
     return 0;
 
   memset(nvlink_info, 0, sizeof(*nvlink_info));
 
   unsigned int linkCount = nvlink_probe_and_cache(gpu_info);
-  if (linkCount == 0)
-    return 0;
 
-  nvlink_info->supported = true;
-  nvlink_info->num_links = linkCount;
-  nvlink_info->version = gpu_info->nvlink_cached_version;
+  if (gpu_info->nvlink_cached_version > 0) {
+    // NVLink hardware detected (version read succeeded), even if no links active.
+    nvlink_info->supported = true;
+    nvlink_info->num_links = linkCount;
+    nvlink_info->version = gpu_info->nvlink_cached_version;
+  }
 
   return nvlink_info->num_links;
 }
diff --git a/src/interface.c b/src/interface.c
index 769d0093..f7a01b78 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -49,12 +49,19 @@ static unsigned int sizeof_device_field[device_field_count] = {
     [device_nvlink_errors] = 19,
 };
 
-// True if any monitored device has NVLink — set before layout is computed
+// True if any monitored device has NVLink hardware support (even if 0 links active).
+// Controls whether to allocate the nvlink_info window for displaying "NVL3 0x" etc.
 static bool any_device_has_nvlink = false;
-
-// When NVLink is present, shrink fan field from 11 to 8 to make room on line 2
+// True if any monitored device has NVLink with active links (linkCount > 0).
+// Controls layout adjustments (shrinking fan field, adjusting line 2 width)
+// and the nvlink_errors window allocation.
+static bool any_device_has_nvlink_active = false;
+
+// When NVLink has ACTIVE links, shrink fan field from 11 to 8 to make room on line 2.
+// Only done when there are actual links to show throughput for — 0-link "NVL3 0x"
+// display does not require any padding reduction.
 static void nvtop_adjust_field_sizes_for_nvlink(void) {
-  if (any_device_has_nvlink) {
+  if (any_device_has_nvlink_active) {
     sizeof_device_field[device_fan_speed] = 8;  // "FAN %3u%%" (was 11 with padding)
   }
 }
@@ -65,18 +72,33 @@ bool nvtop_probe_nvlink_list(struct list_head *devices) {
   if (any_device_has_nvlink)
     return true;
 
+  bool has_nvlink = false;
+  bool has_nvlink_active = false;
+
   struct gpu_info *gpu;
   list_for_each_entry(gpu, devices, list) {
     struct nvlink_info nvl;
     memset(&nvl, 0, sizeof(nvl));
-    if (nvtop_get_nvlink_info(gpu, &nvl) > 0 && nvl.supported)
-      return true;
+    // nvtop_get_nvlink_info returns num_links (could be 0 for "supported but no bridge").
+    // Check nvl.supported separately to catch the 0-link case.
+    nvtop_get_nvlink_info(gpu, &nvl);
+    if (nvl.supported) {
+      has_nvlink = true;
+      if (nvl.num_links > 0)
+        has_nvlink_active = true;
+    }
   }
-  return false;
+
+  any_device_has_nvlink = has_nvlink;
+  any_device_has_nvlink_active = has_nvlink_active;
+  nvtop_adjust_field_sizes_for_nvlink();
+
+  return has_nvlink;
 }
 
 void nvtop_set_nvlink_probe(bool val) {
   any_device_has_nvlink = val;
+  any_device_has_nvlink_active = val;
   nvtop_adjust_field_sizes_for_nvlink();
 }
 
@@ -222,7 +244,8 @@ static void alloc_device_window(unsigned int start_row, unsigned int start_col,
   if (dwin->exec_engines == NULL)
     goto alloc_error;
   // NVLink errors appended to exec_engines on the same row (start_row + 3), conditional on NVLink
-  if (any_device_has_nvlink) {
+  // Only allocate for devices with active links — 0-link devices have no error counters to show.
+  if (any_device_has_nvlink_active) {
     dwin->nvlink_errors =
         newwin(1, sizeof_device_field[device_nvlink_errors], start_row + 3,
                start_col + spacer * 3 + sizeof_device_field[device_shadercores] +
@@ -410,14 +433,16 @@ static void alloc_plot_window(unsigned devices_count, struct window_position *pl
 }
 
 static unsigned device_length(void) {
-  // When no NVLink anywhere, match original repo layout exactly
-  if (!any_device_has_nvlink) {
+  // When no NVLink with active links anywhere, match original repo layout exactly.
+  // 0-link "NVL3 0x" display doesn't need the +2 width adjustment — only throughput
+  // display with compaction needs it.
+  if (!any_device_has_nvlink_active) {
     return max(sizeof_device_field[device_name] + sizeof_device_field[device_pcie] + 1,
                sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] +
                    sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed] +
                    sizeof_device_field[device_power] + 5);
   }
-  // With NVLink: keep line 3 at original width (+3 compensates for fan 11->8, power stays 15)
+  // With NVLink active links: keep line 3 at original width (+3 compensates for fan 11->8, power stays 15)
   return max(sizeof_device_field[device_name] + sizeof_device_field[device_pcie] + 1,
              sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] +
                  sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed] +
@@ -935,8 +960,8 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
     if (dev->nvlink_info != NULL) {
       werase(dev->nvlink_info);
       struct nvlink_info nvl_info;
-      unsigned nvlinks = nvtop_get_nvlink_info(device, &nvl_info);
-      if (nvlinks > 0 && nvl_info.supported) {
+      nvtop_get_nvlink_info(device, &nvl_info);
+      if (nvl_info.supported) {
         wcolor_set(dev->nvlink_info, cyan_color, NULL);
         wprintw(dev->nvlink_info, "NVL");
         wcolor_set(dev->nvlink_info, magenta_color, NULL);
@@ -945,14 +970,21 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
         else
           wprintw(dev->nvlink_info, "?");
         wstandend(dev->nvlink_info);
-        if (nvl_info.num_links < 10)
-          wprintw(dev->nvlink_info, " %ux ", nvl_info.num_links);
-        else
-          wprintw(dev->nvlink_info, "%ux ", nvl_info.num_links);
 
-        if (nvl_info.has_throughput) {
-          unsigned long long total_kib = nvl_info.aggregate_tx + nvl_info.aggregate_rx;
-          print_data_at_scale(dev->nvlink_info, total_kib);
+        if (nvl_info.num_links > 0) {
+          // Active links: show link count and throughput
+          if (nvl_info.num_links < 10)
+            wprintw(dev->nvlink_info, " %ux ", nvl_info.num_links);
+          else
+            wprintw(dev->nvlink_info, "%ux ", nvl_info.num_links);
+
+          if (nvl_info.has_throughput) {
+            unsigned long long total_kib = nvl_info.aggregate_tx + nvl_info.aggregate_rx;
+            print_data_at_scale(dev->nvlink_info, total_kib);
+          }
+        } else {
+          // No active links (no bridge connected) — show "0x"
+          wprintw(dev->nvlink_info, " 0x");
         }
       }
       wnoutrefresh(dev->nvlink_info);

From 4e2ff5668ae278c172314b519c35d2594bed7e99 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sun, 3 May 2026 11:18:23 -0400
Subject: [PATCH 18/31] fix: count only active NVLink links, not physical slots
 with inactive bridges

---
 src/extract_gpuinfo_nvidia.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index 9f37455e..c82e052f 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -1059,14 +1059,21 @@ unsigned nvlink_probe_and_cache(struct gpu_info_nvidia *gpu_info) {
       version = nvlink_marketing_version(version);
   }
 
-  // Probe active links
+  // Probe links. A link is counted only if nvmlDeviceGetNvLinkState succeeds
+  // AND isActive == 1. Without a bridge, the API returns SUCCESS with isActive=0
+  // for all physical link slots — those must NOT be counted.
+  // Counted links must be contiguous from 0: we stop at the first inactive link
+  // (either isActive=0 or API error) to avoid reporting phantom counts.
   for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS_INTERNAL; link++) {
     unsigned int isActive = 0;
     nvmlReturn_t ret = nvmlDeviceGetNvLinkState(device, link, &isActive);
-    if (ret == NVML_SUCCESS || ret == NVML_ERROR_NOT_SUPPORTED) {
-      if (ret == NVML_SUCCESS)
-        linkCount = link + 1;
+    if (ret == NVML_SUCCESS && isActive) {
+      linkCount = link + 1;
+    } else if (ret == NVML_ERROR_NOT_SUPPORTED) {
+      // This link slot does not exist on this hardware — stop probing.
+      break;
     } else {
+      // ret != SUCCESS, or isActive == 0: no more active links.
       break;
     }
   }

From 08ca032f2df47f6cfd5f765fd0532cd661230233 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sun, 3 May 2026 11:35:15 -0400
Subject: [PATCH 19/31] fix: account for NVLink window width in device_length()
 even with 0 active links

When NVLink is supported but no bridge connected, the NVLink info window
is still allocated on line 2 (displaying 'NVL3 0x'). The old code only
expanded the panel width when links were active, causing the NVLink
window to overflow the panel boundary in the 0-link case.

Now device_length() checks any_device_has_nvlink (not
any_device_has_nvlink_active) to include the NVLink window width in the
panel calculation. Fan field padding (11 chars) is preserved since no
throughput display is needed.
---
 src/interface.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/interface.c b/src/interface.c
index f7a01b78..cfbccbb1 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -433,20 +433,24 @@ static void alloc_plot_window(unsigned devices_count, struct window_position *pl
 }
 
 static unsigned device_length(void) {
-  // When no NVLink with active links anywhere, match original repo layout exactly.
-  // 0-link "NVL3 0x" display doesn't need the +2 width adjustment — only throughput
-  // display with compaction needs it.
-  if (!any_device_has_nvlink_active) {
-    return max(sizeof_device_field[device_name] + sizeof_device_field[device_pcie] + 1,
-               sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] +
+  unsigned line1 = sizeof_device_field[device_name] + sizeof_device_field[device_pcie] + 1;
+
+  // Line 2 base: clock, mem_clock, temp, fan, power + spacers (4 spacers + 1 = 5)
+  unsigned line2 = sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] +
                    sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed] +
-                   sizeof_device_field[device_power] + 5);
+                   sizeof_device_field[device_power] + 5;
+
+  if (any_device_has_nvlink) {
+    // NVLink window appended after power field on line 2. Its right edge is:
+    // start_col + clock + mem_clock + temp + fan + pcie + 3
+    // (NVLink window starts after spacer*2 + power, width = pcie - power - spacer*3)
+    // This covers both active links (with fan compaction) and 0-link "NVL3 0x" display.
+    line2 = sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] +
+            sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed] +
+            sizeof_device_field[device_pcie] + 3;
   }
-  // With NVLink active links: keep line 3 at original width (+3 compensates for fan 11->8, power stays 15)
-  return max(sizeof_device_field[device_name] + sizeof_device_field[device_pcie] + 1,
-             sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] +
-                 sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed] +
-                 sizeof_device_field[device_power] + 5 + 2);
+
+  return max(line1, line2);
 }
 
 static pid_t nvtop_pid;

From a527583b973fc3f5577d5ce872b7db8af9ebcc81 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sun, 3 May 2026 11:43:02 -0400
Subject: [PATCH 20/31] Fix: nvtop_set_nvlink_probe() overwrites
 any_device_has_nvlink_active

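nvtop_probe_nvlink_list() correctly sets any_device_has_nvlink_active=false
when NVLink hardware is present but no links are active. However,
nvtop_set_nvlink_probe() then blindly overwrote that flag with its argument
(the probe's return value, true), destroying the distinction.

This caused the fan field to shrink to 8 chars even with 0 active links,
making the line 3 bar charts (GPU/MEM/Enc/Dec) expand incorrectly.

nvtop_set_nvlink_probe() now only updates any_device_has_nvlink and no
longer calls nvtop_adjust_field_sizes_for_nvlink(); the field sizes are
already adjusted inside nvtop_probe_nvlink_list().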
---
 src/interface.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/interface.c b/src/interface.c
index cfbccbb1..a1041e12 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -98,8 +98,9 @@ bool nvtop_probe_nvlink_list(struct list_head *devices) {
 
 void nvtop_set_nvlink_probe(bool val) {
   any_device_has_nvlink = val;
-  any_device_has_nvlink_active = val;
-  nvtop_adjust_field_sizes_for_nvlink();
+  // Do NOT touch any_device_has_nvlink_active — it was already set correctly
+  // inside nvtop_probe_nvlink_list() with the proper distinction between
+  // "hardware present" and "links active".
 }
 
 static unsigned int sizeof_process_field[process_field_count] = {

From 344e5a68952498d3b2432cdc8df1c63ab9a71fab Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sun, 3 May 2026 11:58:10 -0400
Subject: [PATCH 21/31] Fix: device_length() should not expand panel for NVLink
 0-link case
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Panel width (device_length) controls all rows including line 3 bar charts.
Expanding it for the 0-link case was making GPU/MEM/Enc/Dec bars too wide.

Now panel width only expands when any_device_has_nvlink_active (actual links
with throughput to display). For the 0-link "NVL3 0x" case, the NVLink window
extends past the nominal panel edge which is fine — ncurses handles overlapping
windows correctly and line 3 bars stay at proper width.
---
 src/interface.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/interface.c b/src/interface.c
index a1041e12..28974801 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -441,11 +441,10 @@ static unsigned device_length(void) {
                    sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed] +
                    sizeof_device_field[device_power] + 5;
 
-  if (any_device_has_nvlink) {
-    // NVLink window appended after power field on line 2. Its right edge is:
-    // start_col + clock + mem_clock + temp + fan + pcie + 3
-    // (NVLink window starts after spacer*2 + power, width = pcie - power - spacer*3)
-    // This covers both active links (with fan compaction) and 0-link "NVL3 0x" display.
+  if (any_device_has_nvlink_active) {
+    // Only expand panel when NVLink has active links to show throughput.
+    // For 0-link case ("NVL3 0x"), the NVLink window can extend past the
+    // nominal panel edge — it won't affect line 3 bar charts.
     line2 = sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] +
             sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed] +
             sizeof_device_field[device_pcie] + 3;

From 4c29a4ac214e841f2fa4b49725b425284a6b0ed2 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sun, 3 May 2026 12:03:28 -0400
Subject: [PATCH 22/31] Fix: fan display format should use
 any_device_has_nvlink_active, not any_device_has_nvlink

For NVLink-supported GPUs with 0 active links (no bridge), the fan field was
using compact format ("FAN %3u%%") instead of the upstream padded format
(" FAN %3u%%  "). Changed all three fan format conditionals from
any_device_has_nvlink to any_device_has_nvlink_active so the 0-link case
preserves the standard spacing and field width.
---
 src/interface.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/interface.c b/src/interface.c
index 28974801..231fc605 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -886,7 +886,7 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
 
     // FAN
     if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, fan_speed)) {
-      if (any_device_has_nvlink) {
+      if (any_device_has_nvlink_active) {
         mvwprintw(dev->fan_speed, 0, 0, "FAN %3u%%",
                   device->dynamic_info.fan_speed > 100 ? 100 : device->dynamic_info.fan_speed);
         mvwchgat(dev->fan_speed, 0, 0, 3, 0, cyan_color, NULL);
@@ -896,7 +896,7 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
         mvwchgat(dev->fan_speed, 0, 1, 3, 0, cyan_color, NULL);
       }
     } else if (device->static_info.integrated_graphics) {
-      if (any_device_has_nvlink) {
+      if (any_device_has_nvlink_active) {
         mvwprintw(dev->fan_speed, 0, 0, "CPU-FAN");
         mvwchgat(dev->fan_speed, 0, 0, 7, 0, cyan_color, NULL);
       } else {
@@ -904,7 +904,7 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
         mvwchgat(dev->fan_speed, 0, 2, 7, 0, cyan_color, NULL);
       }
     } else if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, fan_rpm)) {
-      if (any_device_has_nvlink) {
+      if (any_device_has_nvlink_active) {
         mvwprintw(dev->fan_speed, 0, 0, "FAN%3uR",
                   device->dynamic_info.fan_rpm > 999 ? 999 : device->dynamic_info.fan_rpm);
         mvwchgat(dev->fan_speed, 0, 0, 3, 0, cyan_color, NULL);

From 7b4987a059713bd73545664769cb0994ee22babf Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sun, 3 May 2026 12:51:05 -0400
Subject: [PATCH 23/31] fix: guard NVLink functions with vendor check to avoid
 container_of on non-NVIDIA GPUs

nvtop_get_nvlink_info(), nvtop_get_nvlink_error_counts(), and
nvtop_reset_nvlink_cache() use container_of() to cast gpu_info to
gpu_info_nvidia. On a non-NVIDIA device this is undefined behavior.

Add a strcmp() guard at the top of each function to return early
for non-NVIDIA GPUs. This avoids the unsafe cast entirely and makes
the code correct for mixed-vendor or NVIDIA-free systems.
---
 src/extract_gpuinfo_nvidia.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index c82e052f..c50ecd02 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -1165,6 +1165,10 @@ static void nvlink_read_errors(nvmlDevice_t device, unsigned int linkCount, stru
 bool nvtop_get_nvlink_error_counts(struct gpu_info *_gpu_info,
                                     unsigned long long *out_errors,
                                     unsigned long long *out_corrections) {
+  // NVLink is an NVIDIA-only technology — skip non-NVIDIA GPUs immediately
+  if (strcmp(_gpu_info->vendor->name, "NVIDIA"))
+    return false;
+
   struct gpu_info_nvidia *gpu_info = container_of(_gpu_info, struct gpu_info_nvidia, base);
   if (!gpu_info->nvlink_errors_baseline_read) {
     return false;
@@ -1333,6 +1337,12 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
   if (!_gpu_info || !nvlink_info)
     return 0;
 
+  // NVLink is an NVIDIA-only technology — skip non-NVIDIA GPUs immediately
+  if (strcmp(_gpu_info->vendor->name, "NVIDIA")) {
+    memset(nvlink_info, 0, sizeof(*nvlink_info));
+    return 0;
+  }
+
   struct gpu_info_nvidia *gpu_info = container_of(_gpu_info, struct gpu_info_nvidia, base);
 
   // If cached info is available (after first refresh), just return it.
@@ -1365,6 +1375,10 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *_gpu_info, struct nvlink_info *n
 
 // Reset all NVLink caches for a single GPU. Called when monitored device set changes.
 void nvtop_reset_nvlink_cache(struct gpu_info *_gpu_info) {
+  // NVLink is an NVIDIA-only technology — skip non-NVIDIA GPUs immediately
+  if (strcmp(_gpu_info->vendor->name, "NVIDIA"))
+    return;
+
   struct gpu_info_nvidia *gpu_info = container_of(_gpu_info, struct gpu_info_nvidia, base);
   gpu_info->nvlink_probed = false;
   gpu_info->nvlink_cached_linkcount = 0;

From 139d585021a5a711fffb362d462e17a39359199d Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sun, 3 May 2026 14:02:58 -0400
Subject: [PATCH 24/31] fix: device_length() should not expand panel for NVLink
 active links
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Line 3 bar charts (GPU/MEM/Enc/Dec) were 6 chars too wide with NVLink
bridge installed. device_length() expanded panel to 90 (line2 with
pcie field) instead of 84 (base layout). NVLink window on line 2 can
extend past nominal panel edge — ncurses handles it fine, same as the
0-link case. Reverting to base layout keeps line 3 bar charts at the
correct width.
---
 src/interface.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/src/interface.c b/src/interface.c
index 231fc605..e93be7ff 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -437,19 +437,14 @@ static unsigned device_length(void) {
   unsigned line1 = sizeof_device_field[device_name] + sizeof_device_field[device_pcie] + 1;
 
   // Line 2 base: clock, mem_clock, temp, fan, power + spacers (4 spacers + 1 = 5)
+  // Do NOT expand for NVLink — the NVLink window on line 2 extends past the
+  // nominal panel edge and ncurses renders it fine. Expanding it would make
+  // line 3 bar charts (GPU/MEM/Enc/Dec) too wide. This applies to both the
+  // 0-link case ("NVL3 0x") and the active-links case (with throughput).
   unsigned line2 = sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] +
                    sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed] +
                    sizeof_device_field[device_power] + 5;
 
-  if (any_device_has_nvlink_active) {
-    // Only expand panel when NVLink has active links to show throughput.
-    // For 0-link case ("NVL3 0x"), the NVLink window can extend past the
-    // nominal panel edge — it won't affect line 3 bar charts.
-    line2 = sizeof_device_field[device_clock] + sizeof_device_field[device_mem_clock] +
-            sizeof_device_field[device_temperature] + sizeof_device_field[device_fan_speed] +
-            sizeof_device_field[device_pcie] + 3;
-  }
-
   return max(line1, line2);
 }
 

From 50d51b7e7d345513d0d00dd8efe117a0ffb26c46 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sun, 3 May 2026 14:33:06 -0400
Subject: [PATCH 25/31] fix: two-tier flag consistency and monitored-set-change
 state reset

Three related fixes:

1. fan N/A fallback (line 912) uses any_device_has_nvlink_active instead
   of any_device_has_nvlink for consistent layout compaction.

2. Reset any_device_has_nvlink_active in interface_check_monitored_gpu_change()
   alongside any_device_has_nvlink to prevent stale flags from causing
   incorrect nvlink_errors window allocation during window rebuild.

3. Reset fan field width to 11 in interface_check_monitored_gpu_change()
   so initialize_curses() allocates fan_speed windows at the correct
   default width after a monitored-set change.
---
 src/interface.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/interface.c b/src/interface.c
index e93be7ff..5988114e 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -909,7 +909,7 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
         mvwchgat(dev->fan_speed, 0, 0, 3, 0, cyan_color, NULL);
       }
     } else {
-      if (any_device_has_nvlink) {
+      if (any_device_has_nvlink_active) {
         mvwprintw(dev->fan_speed, 0, 0, "FAN N/A");
         mvwchgat(dev->fan_speed, 0, 0, 3, 0, cyan_color, NULL);
       } else {
@@ -2243,6 +2243,11 @@ void interface_check_monitored_gpu_change(struct nvtop_interface **interface, un
     // may have switched from an NVLink GPU to a non-NVLink one (or vice versa).
     // The cache will be repopulated on the next refresh cycle.
     any_device_has_nvlink = false;
+    any_device_has_nvlink_active = false;
+    // Reset fan field to default width — it may have been compacted to 8 for
+    // NVLink-active layout. Without this, initialize_curses() below would
+    // allocate fan_speed windows at stale width 8.
+    sizeof_device_field[device_fan_speed] = 11;
     // Reset NVLink probes on all monitored GPUs so they get probed fresh.
     { struct gpu_info *g;
       list_for_each_entry(g, monitoredGpus, list)

From 03a65f3bd162736df3fb7c5db69d32284ac5227f Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Sun, 3 May 2026 18:19:10 -0400
Subject: [PATCH 26/31] # NVTop NVLink Fork - Changelog

Upstream: Syllo/nvtop (commit 095d91c "Remove unused function in ixml")
Fork: danbedford/nvtop, branch `nvlink`
GPU Tested: `NVIDIA GeForce RTX 3090`
Scope: 5 files changed, 706 insertions(+), 19 deletions(-)

---

## Overview

Extends nvtop with per-GPU NVLink info in unused space of the existing interface. When no NVLink-connected GPU is detected, layout and behavior are identical to upstream -- no visual or functional difference. The goal is to bring useful data and throughput to all users of nvtop with NVLink-supported hardware, from consumer (2080, 3090 series) to datacenter (Ampere, Hopper, Blackwell series).

### Main bar (row 2, shown by default)

Appended at end after `power_info` -- NVLink version, link count, and aggregate throughput displayed. Two display states:

NVLink supported device - No bridge or no active links (0-link case, no row 2 padding compaction applied):

    NVL5 0x

With active links (row 2 padding compaction applied, throughput displayed). Example (theoretical fully saturated GB200 with NVLink 5.0):

    NVL518x 1.636 TiB/s

When NVLink is supported but no bridge is connected or links are inactive, only the version and link count display -- no compaction is applied to reclaim space on row 2 since there is no throughput to display. The `NVL5 0x` text extends past the panel edge without affecting the layout. Only when active links exist does fan field compaction kick in (11 to 8 characters) to make room for the throughput value.

- **Label**: `NVL`, the shortest recognizable label for NVLink.
- **Version**: Marketing NVLink version via `nvmlDeviceGetNvLinkVersion` (raw NVML enum values require remapping):
  - Raw 1 -> NVLink V1.0 -> Display 1
  - Raw 2 -> V2.0 -> 2
  - Raw 3 -> V2.2 -> 2
  - Raw 4 -> V3.0 -> 3
  - Raw 5 -> V3.1 -> 3
  - Raw 6 -> V4.0 -> 4
  - Raw 7 -> V5.0 -> 5
  - Raw 8 -> V6.0 -> 6 (assumed Rubin)
  Display shows single-digit major version due to limited space.
- **Link count**: Number of active links on the device; physical link slots with no bridge attached are not counted, which is why a supported GPU without a bridge shows `0x`. Maximum is 36 to future-proof for the planned NVIDIA Rubin.
- **Throughput**: Aggregate Transmit plus Receive utilization, currently read via `nvidia-smi` CLI fallback for all NVLink-connected GPUs. This carries measurable overhead from forking a full binary and parsing its text output, but providing real throughput visibility to consumer GPU users outweighs the cost, and all other non-NVLink users are isolated. The 2-second interval is hardcoded and independent of global nvtop refresh rate to cap CLI calls regardless of display speed. Uses "r" (raw) counters which include payload plus protocol overhead, reflecting true bandwidth utilization. Parses "Link N: Raw Tx: NNNNN KiB" / "Raw Rx" per link. Delta = `(current - previous) / time_delta` per link, summed for aggregate; unsigned underflow guard checks `new >= old` before subtraction. No smoothing applied -- raw accuracy over display smoothness. **TODO:** On datacenter GPUs with `nvmlDeviceGetNvLinkUtilizationCounter`, replace with direct API call; keep CLI fallback for consumer GPUs.
- **Layout compaction**: The Fan field shrinks from 11 to 8 characters ONLY when `any_device_has_nvlink_active` is true (at least one monitored GPU has active NVLink links). GPUs with NVLink hardware but no bridge (0-link case) do NOT get compaction -- `NVL3 0x` extends past the panel edge without needing reclaimed space. Panel width is determined by device name length (`device_name` column = `largest_device_name + 11`), so longer names produce more room for NVLink.
- **Throughput display**: Uses `print_data_at_scale()` (renamed from `print_pcie_at_scale()`) with IEC binary prefixes. Array bounds check extended from `< 5` to `< 6` to support up to tebibytes/s (TiB/s) for Blackwell NVLink 5.0 devices at ~1.636 TiB/s aggregate. The `memory_prefix[]` array already contains entries up to "Pi" -- only the loop guard needed updating.

### Extra GPU info bar (row 4, not shown by default)

Appended at end after `exec_engines` -- error and correction counters since nvtop launch. Example with zeroed counters:

    NVL E:00000 C:00000

Example with non-zero counters (errors in red, corrections in yellow):

    NVL E:00420 C:00069

- **Label**: `NVL`, the shortest recognizable label for NVLink.
- **Error counters**: Replay, recovery, CRC FLIT, and CRC DATA errors via `nvmlDeviceGetNvLinkErrorCounter`, summed across all links. Baseline subtraction ensures counters start at zero on nvtop launch.
- **CRC corrections**: Per-lane CRC flit corrections via `nvmlDeviceGetFieldValues` (field IDs 32-247 for links 0-35), summed across all links. Uses modern signature `(device, valuesCount, fieldValues)` with field IDs populated in-place in the `nvmlFieldValue_t` buffer (48 bytes on NVML 11.515+: fieldId at offset 0, scopeId at 4, timestamp at 8, latencyUsec at 16, valueType at 24, nvmlReturn at 28, value.union at 32). Offsets are manually parsed since `nvml.h` is not exposed in the nvtop build.
- Error counters read during refresh cycle (`gpuinfo_nvidia_refresh_dynamic_info()`), not during startup probe (`nvtop_probe_nvlink_list()` calls `nvtop_get_nvlink_info()` before display is drawn). This ensures the baseline is established at the moment of first display refresh, guaranteeing counters read zero on launch. `nvtop_get_nvlink_info()` does NOT read error counters in the display path.

---

## Files Changed

### include/nvtop/extract_gpuinfo_common.h (+31 lines, -1 line)

- `NVTOP_NVLINK_MAX_LINKS` defined to 36
- Flat struct `nvlink_info`: `num_links`, `version`, `supported`, `has_throughput`, `aggregate_tx`, `aggregate_rx`, `total_errors`, `total_corrections`
- `nvtop_get_nvlink_info()`: return cached NVLink data; vendor guard skips non-NVIDIA GPUs before `container_of()`
- `nvtop_get_nvlink_error_counts()`: public getter for display-ready error/correction counts; bridges `interface.c` to per-device error state in `extract_gpuinfo_nvidia.c`
- `nvtop_probe_nvlink_list()`: probe all devices for NVLink support before curses init; short-circuits if `any_device_has_nvlink` already true
- `nvtop_set_nvlink_probe()`: set `any_device_has_nvlink` global flag only (leaves `any_device_has_nvlink_active` untouched)
- `nvtop_reset_nvlink_cache()`: reset all per-device NVLink caching (probe flag, cached linkcount, cached version, cached info struct) on monitored GPU set change; vendor guard for non-NVIDIA

### include/nvtop/interface_internal_common.h (+4 lines, -1 line)

- `WINDOW *nvlink_info` added to `struct device_window` (row 2 throughput)
- `WINDOW *nvlink_errors` added to `struct device_window` (row 4 errors)
- `device_nvlink_errors` added to `enum device_field` with size 19

### src/extract_gpuinfo_nvidia.c (+451 insertions, -1 deletion)

- Four NVML (NVIDIA Management Library) symbols via `dlsym()`: `nvmlDeviceGetNvLinkState`, `nvmlDeviceGetNvLinkVersion`, `nvmlDeviceGetNvLinkErrorCounter`, `nvmlDeviceGetFieldValues` (modern 3-param signature)
- Per-device state: `device_index`, `cli_poll_active`, per-link CLI counters, baseline/display error fields, probe cache (`nvlink_probed`, `nvlink_cached_linkcount`, `nvlink_cached_version`), full struct cache (`cached_nvlink_info`, `cached_nvlink_info_populated`)
- Link discovery: probes links 0-35 via `nvmlDeviceGetNvLinkState`, counts consecutive successes, stops on first hard error or `NVML_ERROR_NOT_SUPPORTED`; only active links (`isActive == 1`) are counted -- physical slots with no bridge are excluded
- Caching: 3 layers -- (1) link count/version via `nvlink_probe_and_cache()`, (2) full struct via `nvlink_refresh_cached_info()`, (3) list-level probe short-circuit in `nvtop_probe_nvlink_list()`; all reset by `nvtop_reset_nvlink_cache()` on GPU set change
- Throughput: `nvidia-smi nvlink --getthroughput r -i <dev>` every 2 seconds (hardcoded, independent of display refresh rate), delta-based rate computation with unsigned underflow guard
- Error reading via `nvlink_read_errors()`: called from `gpuinfo_nvidia_refresh_dynamic_info()` (not `nvtop_get_nvlink_info()`) to ensure baseline is established at first display refresh; reads errors via `nvmlDeviceGetNvLinkErrorCounter` and corrections via `nvmlDeviceGetFieldValues`; unsigned underflow guard prevents counter wrap artifacts

### src/interface.c (+220 insertions, -20 deletions)

- Conditional layout: `any_device_has_nvlink` controls window allocation; `any_device_has_nvlink_active` controls fan compaction (shrinks from 11 to 8 chars only when active links exist -- 0-link devices do not get compaction)
- `device_length()` always uses base layout (clock + mem_clock + temp + fan + power + 5) regardless of NVLink state; NVLink window on line 2 extends past nominal panel edge, which ncurses handles gracefully
- `nvtop_adjust_field_sizes_for_nvlink()` checks `any_device_has_nvlink_active` (not `any_device_has_nvlink`) for fan compaction
- `interface_check_monitored_gpu_change()` resets ALL mutable NVLink state: both global flags plus `sizeof_device_field[device_fan_speed] = 11`, then calls per-device `nvtop_reset_nvlink_cache()`
- Fan N/A fallback branch uses `any_device_has_nvlink_active` for correct 11-character format on 0-link devices
- NVLink info window (row 2): displays `print_data_at_scale()`-formatted throughput (renamed from `print_pcie_at_scale()`; bounds check extended to `< 6` for TiB/s ceiling)
- NVLink errors window (row 4): reads via `nvtop_get_nvlink_error_counts()` (does NOT call `nvtop_get_nvlink_info()` in display path)
- Memory leak fixes: added missing `delwin()` for `shader_cores`, `l2_cache_size`, `exec_engines`, `plots[i].plot_window`, and `nvlink_errors`. Two of these are also submitted as standalone upstream PRs: `free_device_windows()` fix (PR #467) and `plots[i].plot_window` fix (PR #468).

### src/nvtop.c (+5 lines, -1 line)

- `nvtop_probe_nvlink_list()` and `nvtop_set_nvlink_probe()` called before curses init (first layout pass)
- Re-evaluated in main loop after `interface_check_monitored_gpu_change()` for GPU hotplug

---

## Design Decisions

### Flat struct over nested

Single struct per device. Error and correction counters are cumulative totals (unsigned long long) summed across all links. Avoids per-link arrays and dynamic allocation in the hot refresh path.

### Two-tier error state: baseline plus display

Five fields in `struct gpu_info_nvidia` track error state: `baseline_errors`, `baseline_corrections`, `nvlink_errors_baseline_read` (bool), `display_errors`, `display_corrections`. Baselines persist for the entire process lifetime. Display values computed each refresh as `cumulative - baseline`.

### total_errors / total_corrections retained for API compatibility

Populated from `display_errors`/`display_corrections` in `nvlink_refresh_cached_info()`. Primary display path uses `nvtop_get_nvlink_error_counts()`, but both carry the same data.

### No new dependencies

Uses only NVML symbols already in `nvidia-ml` driver and `nvidia-smi` binary already on the system.

---

## What Was Not Changed

Process listing, memory/GPU charts, configuration options, keyboard shortcuts, menu behavior, and all non-NVLink display fields remain identical to upstream.

---

## Testing

Dual RTX 3090 Founders Edition 24GB with 3-slot NVLink Bridge (RTXA6000NVLINK3S-KIT). Displays in UI as: `NVIDIA GeForce RTX 3090`. 4 physical links per GPU. `enum nvmlNvlinkVersion_t` returns `5` representing NVLink v3.1. When idle, NVLink shows ~1.2 MiB/s aggregate residual throughput from assumed protocol keep-alives/link maintenance. Errors/corrections correctly display `E:00000 C:00000` on every launch, incrementing only when new errors occur (the increment path is not fully confirmed, since no errors occurred during testing).
---
 src/extract_gpuinfo_nvidia.c | 12 ++++++------
 src/interface.c              |  6 ++++--
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index c50ecd02..3a2b6f1c 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -290,8 +290,8 @@ struct gpu_info_nvidia {
   unsigned long long nvlink_cli_tx[NVTOP_NVLINK_MAX_LINKS]; // Per-link cumulative TX from CLI
   unsigned long long nvlink_cli_rx[NVTOP_NVLINK_MAX_LINKS]; // Per-link cumulative RX from CLI
   nvtop_time last_nvlink_cli_time; // Timestamp of last CLI poll (uses app's existing time API)
-  unsigned long long smoothed_agg_tx; // EMA-smoothed aggregate TX for display
-  unsigned long long smoothed_agg_rx; // EMA-smoothed aggregate RX for display
+  unsigned long long cli_agg_tx; // Computed aggregate TX from CLI polling for display
+  unsigned long long cli_agg_rx; // Computed aggregate RX from CLI polling for display
 
   // NVLink error counter baselines (cumulative since boot, tracked per-device)
   unsigned long long baseline_errors; // Cumulative errors at last read
@@ -1300,8 +1300,8 @@ static void nvlink_refresh_cached_info(struct gpu_info_nvidia *gpu_info, unsigne
             total_rx += cli_rx[link] - gpu_info->nvlink_cli_rx[link];
         }
         // Raw rate (no smoothing — accuracy is more important than display smoothness)
-        gpu_info->smoothed_agg_tx = (unsigned long long)((double)total_tx / delta_s);
-        gpu_info->smoothed_agg_rx = (unsigned long long)((double)total_rx / delta_s);
+        gpu_info->cli_agg_tx = (unsigned long long)((double)total_tx / delta_s);
+        gpu_info->cli_agg_rx = (unsigned long long)((double)total_rx / delta_s);
       }
 
       memcpy(gpu_info->nvlink_cli_tx, cli_tx, linkCount * sizeof(unsigned long long));
@@ -1313,8 +1313,8 @@ static void nvlink_refresh_cached_info(struct gpu_info_nvidia *gpu_info, unsigne
   // Aggregate throughput
   if (gpu_info->cli_poll_active) {
     cache->has_throughput = true;
-    cache->aggregate_tx = gpu_info->smoothed_agg_tx;
-    cache->aggregate_rx = gpu_info->smoothed_agg_rx;
+    cache->aggregate_tx = gpu_info->cli_agg_tx;
+    cache->aggregate_rx = gpu_info->cli_agg_rx;
   } else {
     cache->has_throughput = false;
     cache->aggregate_tx = 0;
diff --git a/src/interface.c b/src/interface.c
index 5988114e..5d81042c 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -53,8 +53,8 @@ static unsigned int sizeof_device_field[device_field_count] = {
 // Controls whether to allocate the nvlink_info window for displaying "NVL3 0x" etc.
 static bool any_device_has_nvlink = false;
 // True if any monitored device has NVLink with active links (linkCount > 0).
-// Controls layout adjustments (shrinking fan field, adjusting line 2 width)
-// and the nvlink_errors window allocation.
+// Controls layout adjustments (shrinking fan field) and the nvlink_errors
+// window allocation.
 static bool any_device_has_nvlink_active = false;
 
 // When NVLink has ACTIVE links, shrink fan field from 11 to 8 to make room on line 2.
@@ -63,6 +63,8 @@ static bool any_device_has_nvlink_active = false;
 static void nvtop_adjust_field_sizes_for_nvlink(void) {
   if (any_device_has_nvlink_active) {
     sizeof_device_field[device_fan_speed] = 8;  // "FAN %3u%%" (was 11 with padding)
+  } else {
+    sizeof_device_field[device_fan_speed] = 11; // Restore default padding
   }
 }
 

From 666ffed6a3e7108cef3a05ff08691f2f945ee64d Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Wed, 6 May 2026 18:55:55 -0400
Subject: [PATCH 27/31] feat: address PR #469 maintainer review comments

Replace CLI-based NVLink throughput with NVML API and refactor
probe/layout initialization per maintainer feedback.

Comment #1: include nvml.h, remove local typedefs, add #ifndef
guards for enum constants, update dlsym function pointer to use
proper nvmlFieldValue_t type, remove raw memcpy offset macros.

Comment #2: wire nvlink_nvidia_disabled.c stub file into
CMakeLists.txt else-branch for non-NVIDIA builds.

Comment #3: remove per-lane CRC corrections loop from
nvlink_read_errors() (Phase 2) - corrections now read in batched
call in nvlink_refresh_cached_info().

Comment #4: replace nvidia-smi CLI fallback with single batched
nvmlDeviceGetFieldValues call for RAW TX (140), RAW RX (141), and
CRC corrections (38). Use scopeId=UINT_MAX for throughput
aggregate, scopeId=0 for per-device corrections.

Comment #5: remove nvlink_cli_get_throughput() function and CLI
struct fields (device_index, cli_poll_active, nvlink_cli_tx/rx,
last_nvlink_cli_time, cli_agg_tx/rx). Replace with nvlink_last_tx,
nvlink_last_rx, nvlink_last_poll_time.

Comment #6: use struct nvlink_info nvl = {0} initializer in
nvtop_probe_nvlink_list().

Comment #7: move nvtop_adjust_field_sizes_for_nvlink() into
initialize_all_windows(), remove nvtop_set_nvlink_probe() entirely,
swap probe and interface_check_monitored_gpu_change() call order in
nvtop.c main loop, add re-probe in monitored-set-change handler.
---
 include/nvtop/extract_gpuinfo_common.h |   1 -
 src/CMakeLists.txt                     |  15 ++
 src/extract_gpuinfo_nvidia.c           | 272 ++++++++++---------------
 src/interface.c                        |  19 +-
 src/nvlink_nvidia_disabled.c           |  31 +++
 src/nvtop.c                            |   7 +-
 6 files changed, 173 insertions(+), 172 deletions(-)
 create mode 100644 src/nvlink_nvidia_disabled.c

diff --git a/include/nvtop/extract_gpuinfo_common.h b/include/nvtop/extract_gpuinfo_common.h
index bd1fe606..99dfc01c 100644
--- a/include/nvtop/extract_gpuinfo_common.h
+++ b/include/nvtop/extract_gpuinfo_common.h
@@ -264,7 +264,6 @@ bool nvtop_get_nvlink_error_counts(struct gpu_info *gpu_info,
 
 // NVLink probe — call before initialize_curses to set layout mode
 bool nvtop_probe_nvlink_list(struct list_head *devices);
-void nvtop_set_nvlink_probe(bool val);
 
 // Reset per-GPU NVLink cache (probed flag, cached linkcount/version, cached info struct).
 // Call when the monitored device set changes so newly-monitored NVLink GPUs get probed fresh.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b485cb40..d84d42aa 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -52,6 +52,21 @@ endif()
 
 if(NVIDIA_SUPPORT)
   target_sources(nvtop PRIVATE extract_gpuinfo_nvidia.c)
+  find_path(NVML_INCLUDE_DIR nvml.h
+    PATHS /usr/local/cuda-12.8/targets/x86_64-linux/include
+          /usr/local/cuda/targets/x86_64-linux/include
+          /usr/local/cuda/include
+          /usr/lib/x86_64-linux-gnu/nvidia
+          /usr/include
+    NO_DEFAULT_PATH)
+  if (NVML_INCLUDE_DIR)
+    message(STATUS "Found nvml.h at ${NVML_INCLUDE_DIR}")
+    include_directories(${NVML_INCLUDE_DIR})
+  else()
+    message(WARNING "nvml.h not found; NVLink error counters and corrections may be unavailable")
+  endif()
+else()
+  target_sources(nvtop PRIVATE nvlink_nvidia_disabled.c)
 endif()
 
 if(ASCEND_SUPPORT)
diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index f64618f4..f451c362 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -25,18 +25,24 @@
 
 #include <dlfcn.h>
 #include <errno.h>
+#include <limits.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
+#include "nvml.h"
+
+#ifndef NVML_SUCCESS
 #define NVML_SUCCESS 0
+#endif
+#ifndef NVML_ERROR_NOT_SUPPORTED
 #define NVML_ERROR_NOT_SUPPORTED 3
+#endif
+#ifndef NVML_ERROR_INSUFFICIENT_SIZE
 #define NVML_ERROR_INSUFFICIENT_SIZE 7
-
-typedef struct nvmlDevice *nvmlDevice_t;
-typedef int nvmlReturn_t; // store the enum as int
+#endif
 
 // Init and shutdown
 
@@ -212,7 +218,7 @@ nvmlReturn_t (*nvmlDeviceGetMigMode)(nvmlDevice_t device, unsigned int *currentM
 static nvmlReturn_t (*nvmlDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, unsigned int *isActive);
 static nvmlReturn_t (*nvmlDeviceGetNvLinkVersion)(nvmlDevice_t device, unsigned int link, unsigned int *version);
 static nvmlReturn_t (*nvmlDeviceGetNvLinkErrorCounter)(nvmlDevice_t device, unsigned int counter, unsigned int link, unsigned long long *value);
-static nvmlReturn_t (*nvmlDeviceGetFieldValues)(nvmlDevice_t device, unsigned int valuesCount, void *fieldValues);
+static nvmlReturn_t (*nvmlDeviceGetFieldValues)(nvmlDevice_t, unsigned int, nvmlFieldValue_t *);
 
 static void *libnvidia_ml_handle;
 
@@ -284,14 +290,10 @@ struct gpu_info_nvidia {
   bool isInMigMode;
   unsigned long long last_utilization_timestamp;
 
-  // NVLink throughput via nvidia-smi CLI (consumer GPUs like RTX 3090)
-  unsigned int device_index; // For nvidia-smi -i calls
-  bool cli_poll_active; // True once CLI fallback has been successfully initialized
-  unsigned long long nvlink_cli_tx[NVTOP_NVLINK_MAX_LINKS]; // Per-link cumulative TX from CLI
-  unsigned long long nvlink_cli_rx[NVTOP_NVLINK_MAX_LINKS]; // Per-link cumulative RX from CLI
-  nvtop_time last_nvlink_cli_time; // Timestamp of last CLI poll (uses app's existing time API)
-  unsigned long long cli_agg_tx; // Computed aggregate TX from CLI polling for display
-  unsigned long long cli_agg_rx; // Computed aggregate RX from CLI polling for display
+  // NVLink throughput via NVML API (raw counters, aggregate across all links)
+  unsigned long long nvlink_last_tx;       // Cumulative aggregate TX for delta computation
+  unsigned long long nvlink_last_rx;       // Cumulative aggregate RX for delta computation
+  nvtop_time nvlink_last_poll_time;        // Timestamp for poll throttling
 
   // NVLink error counter baselines (cumulative since boot, tracked per-device)
   unsigned long long baseline_errors; // Cumulative errors at last read
@@ -594,7 +596,6 @@ static bool gpuinfo_nvidia_get_device_handles(struct list_head *devices, unsigne
       nvmlReturn_t pciInfoRet = nvmlDeviceGetPciInfo(gpu_infos[*count].gpuhandle, &pciInfo);
       if (pciInfoRet == NVML_SUCCESS) {
         strncpy(gpu_infos[*count].base.pdev, pciInfo.busIdLegacy, PDEV_LEN);
-        gpu_infos[*count].device_index = i;
         list_add_tail(&gpu_infos[*count].base.list, devices);
         *count += 1;
       }
@@ -1011,25 +1012,35 @@ static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info) {
     gpuinfo_nvidia_get_process_utilization(gpu_info, _gpu_info->processes_count, _gpu_info->processes);
 }
 
-// NVML NVLink enums (defined locally since we don't have nvml.h)
+// NVML NVLink enums (guarded — nvml.h defines these; local fallback for older drivers)
+#ifndef NVML_NVLINK_MAX_LINKS_INTERNAL
 #define NVML_NVLINK_MAX_LINKS_INTERNAL 36
+#endif
 
+#ifndef NVML_NVLINK_ERROR_DL_REPLAY
 // NVML error counter types
 #define NVML_NVLINK_ERROR_DL_REPLAY   0
 #define NVML_NVLINK_ERROR_DL_RECOVERY 1
 #define NVML_NVLINK_ERROR_DL_CRC_FLIT 2
 #define NVML_NVLINK_ERROR_DL_CRC_DATA 3
 #define NVML_NVLINK_ERROR_DL_ECC_DATA 4
-
-// nvmlFieldValue_t struct layout (from nvml.h — offsets may vary by driver version)
-// Total size: 48 bytes (NVML 11.515+, verified on driver 580.142)
-// Layout: fieldId:u32(0), scopeId:u32(4), timestamp:u64(8), latencyUsec:u64(16),
-//         valueType:u32(24), nvmlReturn:u32(28), value.union(32) [ullVal at offset 32],
-// NOTE: nvmlFieldValue_t layout varies across NVML/driver versions.
-// Always verify against the header shipped with the driver you're targeting.
-#define NVM_LVALUE_FIELD_ID_OFF     0
-#define NVM_LVALUE_UINT64_OFF       32
-#define NVM_LVALUE_SIZE             48
+#endif
+
+// Helper: Query a single NVML field value via nvmlDeviceGetFieldValues.
+// Returns true if the field was successfully read into *out_val.
+static bool nvlink_query_field(nvmlDevice_t device, unsigned int field_id,
+                               unsigned int scope_id, unsigned long long *out_val) {
+    if (!nvmlDeviceGetFieldValues)
+        return false;
+    struct nvmlFieldValue_t fv = {0};
+    fv.fieldId = field_id;
+    fv.scopeId = scope_id;
+    nvmlReturn_t ret = nvmlDeviceGetFieldValues(device, 1, &fv);
+    if (ret != NVML_SUCCESS || fv.nvmlReturn != NVML_SUCCESS)
+        return false;
+    *out_val = fv.value.ullVal;
+    return true;
+}
 
 // Probe NVLink link count and version, caching results in gpu_info_nvidia to avoid
 // repeated NVML API calls on every refresh cycle. linkCount and version are static
@@ -1084,13 +1095,12 @@ unsigned nvlink_probe_and_cache(struct gpu_info_nvidia *gpu_info) {
   return linkCount;
 }
 
-// Read NVLink error counters and CRC corrections, storing results in the persistent gpu_info struct.
-// Uses baseline subtraction to show only errors/corrections since nvtop launch (Option B).
+// Read NVLink error counters (replay, recovery, CRC), storing results in the persistent gpu_info struct.
+// Uses baseline subtraction to show only errors since nvtop launch (Option B).
 // Called from refresh_dynamic_info so it does NOT run during the startup probe in nvtop_probe_nvlink_list.
-// Phase 1: nvmlDeviceGetNvLinkErrorCounter for replay, recovery, CRC errors per link.
-// Phase 2: nvmlDeviceGetFieldValues for per-lane CRC flit corrections (field IDs 32-247 for links 0-35).
+// Corrections are read separately in nvlink_refresh_cached_info() via NVML batched field query.
 static void nvlink_read_errors(nvmlDevice_t device, unsigned int linkCount, struct gpu_info_nvidia *gpu_info) {
-  // Phase 1: error counters via nvmlDeviceGetNvLinkErrorCounter
+  // Error counters via nvmlDeviceGetNvLinkErrorCounter
   unsigned long long cumulative_errors = 0;
   if (nvmlDeviceGetNvLinkErrorCounter) {
     for (unsigned int link = 0; link < linkCount; link++) {
@@ -1111,52 +1121,17 @@ static void nvlink_read_errors(nvmlDevice_t device, unsigned int linkCount, stru
     }
   }
 
-  // Phase 2: per-lane CRC corrections via nvmlDeviceGetFieldValues
-  // Field IDs: link 0 = 32-37, link 1 = 38-43, link 2 = 44-49, etc. (6 field IDs per link for lanes 0-5, up to link 35 = 242-247)
-  // The caller must populate fieldId in each nvmlFieldValue_t entry BEFORE calling;
-  // the library populates the value fields on return.
-  // nvmlFieldValue_t is 48 bytes: fieldId:u32(0), scopeId:u32(4), timestamp:u64(8),
-  // latencyUsec:u64(16), valueType:u32(24), nvmlReturn:u32(28), value.union(32).
-  unsigned long long cumulative_corrections = 0;
-  if (nvmlDeviceGetFieldValues) {
-    for (unsigned int link = 0; link < linkCount; link++) {
-      int base_field_id = 32 + link * 6;
-      char raw[6 * NVM_LVALUE_SIZE];
-
-      // Zero out the buffer, then populate fieldId in each entry (offset 0, uint32_t)
-      memset(raw, 0, sizeof(raw));
-      for (int i = 0; i < 6; i++) {
-        unsigned int fid = (unsigned int)(base_field_id + i);
-        memcpy(raw + i * NVM_LVALUE_SIZE + NVM_LVALUE_FIELD_ID_OFF, &fid, sizeof(fid));
-      }
-
-      nvmlReturn_t ret = nvmlDeviceGetFieldValues(device, 6, raw);
-      if (ret == NVML_SUCCESS) {
-        for (int i = 0; i < 6; i++) {
-          unsigned long long val = 0;
-          memcpy(&val, raw + i * NVM_LVALUE_SIZE + NVM_LVALUE_UINT64_OFF, sizeof(val));
-          cumulative_corrections += val;
-        }
-      }
-    }
-  }
-
-  // Baseline subtraction: show only errors/corrections since nvtop launch
+  // Baseline subtraction: show only errors since nvtop launch
   if (!gpu_info->nvlink_errors_baseline_read) {
     // First read — establish baseline, display zeros
     gpu_info->baseline_errors = cumulative_errors;
-    gpu_info->baseline_corrections = cumulative_corrections;
     gpu_info->nvlink_errors_baseline_read = true;
     gpu_info->display_errors = 0;
-    gpu_info->display_corrections = 0;
   } else {
     // Subsequent reads — show delta from baseline
     gpu_info->display_errors = cumulative_errors > gpu_info->baseline_errors
                                   ? cumulative_errors - gpu_info->baseline_errors
                                   : 0;
-    gpu_info->display_corrections = cumulative_corrections > gpu_info->baseline_corrections
-                                       ? cumulative_corrections - gpu_info->baseline_corrections
-                                       : 0;
   }
 }
 
@@ -1178,45 +1153,6 @@ bool nvtop_get_nvlink_error_counts(struct gpu_info *_gpu_info,
   return true;
 }
 
-// Parse nvidia-smi nvlink --getthroughput r output
-// "r" (raw) includes payload + protocol overhead — needed to show true bandwidth utilization
-// (consumer GPUs do not expose NVML nvmlDeviceGetNvLinkUtilizationCounter)
-// Returns number of links parsed (0 on failure)
-static unsigned nvlink_cli_get_throughput(int device_index, unsigned int link_count,
-                                          unsigned long long *tx_out, unsigned long long *rx_out) {
-  char cmd[256];
-  snprintf(cmd, sizeof(cmd), "nvidia-smi nvlink --getthroughput r -i %d 2>/dev/null", device_index);
-
-  FILE *fp = popen(cmd, "r");
-  if (!fp)
-    return 0;
-
-  char line[512];
-  unsigned parsed = 0;
-  memset(tx_out, 0, link_count * sizeof(unsigned long long));
-  memset(rx_out, 0, link_count * sizeof(unsigned long long));
-
-  while (fgets(line, sizeof(line), fp)) {
-    int link = -1;
-    unsigned long long val = 0;
-    char *p = line;
-    while (*p == '\t' || *p == ' ')
-      p++;
-    if (sscanf(p, "Link %u: Raw Tx: %llu", &link, &val) == 2 && (unsigned)link < link_count) {
-      tx_out[link] = val;
-      parsed++;
-    } else if (sscanf(p, "Link %u: Raw Rx: %llu", &link, &val) == 2 && (unsigned)link < link_count) {
-      rx_out[link] = val;
-      parsed++;
-    }
-    if (parsed >= link_count * 2)
-      break;
-  }
-
-  pclose(fp);
-  return parsed >= (unsigned)link_count * 2 ? link_count : 0;
-}
-
 // Remap raw NVML NVLink protocol version to the marketing version.
 // NVML raw values do NOT equal marketing versions (raw 5 = 3.1 -> rounds to 3).
 static unsigned int nvlink_marketing_version(unsigned int raw_version) {
@@ -1234,9 +1170,8 @@ static unsigned int nvlink_marketing_version(unsigned int raw_version) {
   }
 }
 
-// Get NVLink info (version, link count, aggregate throughput via CLI).
-// Designed for consumer GPUs (RTX 3090) where NVML utilization counters are unavailable.
-// Populate cached_nvlink_info with link count, version, throughput, and error counts.
+// Get NVLink info (version, link count, aggregate throughput via NVML API).
+// Populate cached_nvlink_info with link count, version, throughput, and error/correction counts.
 // Called from refresh_dynamic_info on every refresh cycle (refresh path).
 // GPUs are non-hot-swappable, so all NVLink data is computed here and cached —
 // nvtop_get_nvlink_info() in the draw path just returns the cached copy.
@@ -1258,70 +1193,83 @@ static void nvlink_refresh_cached_info(struct gpu_info_nvidia *gpu_info, unsigne
     return;
   }
 
-  // Throughput via nvidia-smi CLI (NVML utilization counters unavailable on consumer GPUs).
-  // Poll every 2 seconds to keep CPU overhead low.
-  //
-  // TODO: On datacenter GPUs (A100, H100, etc.) that expose NVML
-  // nvmlDeviceGetNvLinkUtilizationCounter, replace this CLI path with the
-  // direct API call (zero process overhead). Keep this nvidia-smi CLI code
-  // as a conditional fallback for consumer GPUs (RTX 3090, 3080 Ti) where
-  // the NVML utilization counter is not exposed.
-  // Hardcoded 2-second CLI poll interval — independent of global nvtop refresh rate.
-  // nvidia-smi is a resource-heavy process (full binary fork + text parsing). This
-  // throttles the expensive popen/pclose calls to a maximum of one per 2 seconds,
-  // minimizing resource usage regardless of how fast the user sets the display refresh.
-  // A faster global refresh (e.g. 0.5s) would otherwise fork nvidia-smi far too often,
-  // degrading overall system performance. The delta-based rate computation
-  // (total_bytes / delta_s) normalizes to a per-second value, so the displayed
-  // throughput remains accurate even with a 2-second sample window.
+  // Throughput and corrections via NVML API in a single batched call.
+  // RAW fields (140/141) include protocol overhead; DATA fields (138/139) return
+  // identical TX/RX on consumer GPUs with aggregate scopeId, yielding zero throughput.
+  // Field 38 (CRC corrections) is already per-device aggregate -- scopeId=0.
+  // Queried on every call; rates are derived from the elapsed time since the previous poll.
   nvtop_time current_time;
   nvtop_get_current_time(&current_time);
-  if (gpu_info->last_nvlink_cli_time.tv_sec == 0 ||
-      nvtop_difftime(gpu_info->last_nvlink_cli_time, current_time) >= 2.) {
-
-    unsigned long long cli_tx[NVTOP_NVLINK_MAX_LINKS] = {0};
-    unsigned long long cli_rx[NVTOP_NVLINK_MAX_LINKS] = {0};
-
-    if (nvlink_cli_get_throughput(gpu_info->device_index, linkCount, cli_tx, cli_rx)) {
-      gpu_info->cli_poll_active = true;
-
-      if (gpu_info->last_nvlink_cli_time.tv_sec > 0) {
-        double delta_s = nvtop_difftime(gpu_info->last_nvlink_cli_time, current_time);
-        if (delta_s <= 0.) delta_s = 1e-9;
-
-        unsigned long long total_tx = 0, total_rx = 0;
-        for (unsigned int link = 0; link < linkCount; link++) {
-          // Guard against unsigned underflow if the hardware counter wraps or resets.
-          // If the new reading is less than the stored reading, skip this link to
-          // avoid a delta near ULLONG_MAX that would produce an absurd throughput spike.
-          if (cli_tx[link] >= gpu_info->nvlink_cli_tx[link])
-            total_tx += cli_tx[link] - gpu_info->nvlink_cli_tx[link];
-          if (cli_rx[link] >= gpu_info->nvlink_cli_rx[link])
-            total_rx += cli_rx[link] - gpu_info->nvlink_cli_rx[link];
-        }
-        // Raw rate (no smoothing — accuracy is more important than display smoothness)
-        gpu_info->cli_agg_tx = (unsigned long long)((double)total_tx / delta_s);
-        gpu_info->cli_agg_rx = (unsigned long long)((double)total_rx / delta_s);
-      }
+  double delta_s = (gpu_info->nvlink_last_poll_time.tv_sec > 0)
+                     ? nvtop_difftime(gpu_info->nvlink_last_poll_time, current_time)
+                     : 0;
+
+  // Single batched nvmlDeviceGetFieldValues call for TX, RX, and corrections.
+  // Each entry's nvmlReturn field is checked individually for validity.
+  struct nvmlFieldValue_t batch[3] = {0};
+  batch[0].fieldId = NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX;
+  batch[0].scopeId = UINT_MAX;
+  batch[1].fieldId = NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX;
+  batch[1].scopeId = UINT_MAX;
+  batch[2].fieldId = NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL;
+  batch[2].scopeId = 0;
+
+  unsigned long long new_tx = 0, new_rx = 0, new_corrections = 0;
+  bool got_tx = false, got_rx = false, got_corrections = false;
 
-      memcpy(gpu_info->nvlink_cli_tx, cli_tx, linkCount * sizeof(unsigned long long));
-      memcpy(gpu_info->nvlink_cli_rx, cli_rx, linkCount * sizeof(unsigned long long));
+  if (nvmlDeviceGetFieldValues) {
+    nvmlReturn_t ret = nvmlDeviceGetFieldValues(gpu_info->gpuhandle, 3, batch);
+    if (ret == NVML_SUCCESS) {
+      if (batch[0].nvmlReturn == NVML_SUCCESS) {
+        new_tx = batch[0].value.ullVal;
+        got_tx = true;
+      }
+      if (batch[1].nvmlReturn == NVML_SUCCESS) {
+        new_rx = batch[1].value.ullVal;
+        got_rx = true;
+      }
+      if (batch[2].nvmlReturn == NVML_SUCCESS) {
+        new_corrections = batch[2].value.ullVal;
+        got_corrections = true;
+      }
     }
-    gpu_info->last_nvlink_cli_time = current_time;
   }
 
-  // Aggregate throughput
-  if (gpu_info->cli_poll_active) {
-    cache->has_throughput = true;
-    cache->aggregate_tx = gpu_info->cli_agg_tx;
-    cache->aggregate_rx = gpu_info->cli_agg_rx;
+  // Throughput delta computation (TX + RX)
+  if (got_tx || got_rx) {
+    if (gpu_info->nvlink_last_poll_time.tv_sec > 0 && delta_s > 0) {
+      unsigned long long delta_tx = (new_tx >= gpu_info->nvlink_last_tx)
+                                       ? new_tx - gpu_info->nvlink_last_tx : 0;
+      unsigned long long delta_rx = (new_rx >= gpu_info->nvlink_last_rx)
+                                       ? new_rx - gpu_info->nvlink_last_rx : 0;
+      cache->aggregate_tx = (unsigned long long)((double)delta_tx / delta_s);
+      cache->aggregate_rx = (unsigned long long)((double)delta_rx / delta_s);
+      cache->has_throughput = true;
+    } else {
+      cache->has_throughput = false;
+    }
+    gpu_info->nvlink_last_tx = new_tx;
+    gpu_info->nvlink_last_rx = new_rx;
   } else {
     cache->has_throughput = false;
     cache->aggregate_tx = 0;
     cache->aggregate_rx = 0;
   }
+  gpu_info->nvlink_last_poll_time = current_time;
+
+  // Corrections -- use same baseline subtraction pattern as errors
+  if (got_corrections) {
+    if (!gpu_info->nvlink_errors_baseline_read) {
+      gpu_info->baseline_corrections = new_corrections;
+      gpu_info->display_corrections = 0;
+      gpu_info->nvlink_errors_baseline_read = true;
+    } else {
+      gpu_info->display_corrections = new_corrections > gpu_info->baseline_corrections
+                                        ? new_corrections - gpu_info->baseline_corrections : 0;
+    }
+  }
 
-  // Error/correction counts from display-ready fields (populated by nvlink_read_errors)
+  // Error/correction counts from display-ready fields
   cache->total_errors = gpu_info->display_errors;
   cache->total_corrections = gpu_info->display_corrections;
 
@@ -1385,4 +1333,10 @@ void nvtop_reset_nvlink_cache(struct gpu_info *_gpu_info) {
   gpu_info->nvlink_cached_version = 0;
   gpu_info->cached_nvlink_info_populated = false;
   memset(&gpu_info->cached_nvlink_info, 0, sizeof(gpu_info->cached_nvlink_info));
+  gpu_info->baseline_errors = 0;
+  gpu_info->baseline_corrections = 0;
+  gpu_info->nvlink_errors_baseline_read = false;
+  gpu_info->nvlink_last_tx = 0;
+  gpu_info->nvlink_last_rx = 0;
+  gpu_info->nvlink_last_poll_time = (struct timespec){0};
 }
diff --git a/src/interface.c b/src/interface.c
index 2bb8a5e2..e0cc2013 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -79,8 +79,7 @@ bool nvtop_probe_nvlink_list(struct list_head *devices) {
 
   struct gpu_info *gpu;
   list_for_each_entry(gpu, devices, list) {
-    struct nvlink_info nvl;
-    memset(&nvl, 0, sizeof(nvl));
+    struct nvlink_info nvl = {0};
     // nvtop_get_nvlink_info returns num_links (could be 0 for "supported but no bridge").
     // Check nvl.supported separately to catch the 0-link case.
     nvtop_get_nvlink_info(gpu, &nvl);
@@ -93,18 +92,10 @@ bool nvtop_probe_nvlink_list(struct list_head *devices) {
 
   any_device_has_nvlink = has_nvlink;
   any_device_has_nvlink_active = has_nvlink_active;
-  nvtop_adjust_field_sizes_for_nvlink();
 
   return has_nvlink;
 }
 
-void nvtop_set_nvlink_probe(bool val) {
-  any_device_has_nvlink = val;
-  // Do NOT touch any_device_has_nvlink_active — it was already set correctly
-  // inside nvtop_probe_nvlink_list() with the proper distinction between
-  // "hardware present" and "links active".
-}
-
 static unsigned int sizeof_process_field[process_field_count] = {
     [process_pid] = 7,       [process_user] = 4,          [process_gpu_id] = 3,   [process_type] = 8,
     [process_gpu_rate] = 4,  [process_enc_rate] = 4,      [process_dec_rate] = 4,
@@ -460,6 +451,10 @@ static void initialize_all_windows(struct nvtop_interface *dwin) {
   struct window_position plot_positions[MAX_CHARTS];
   struct window_position setup_position;
 
+  // NVLink layout adjustments must happen before panel dimensions are computed.
+  // any_device_has_nvlink_active is set by the probe that runs before this function.
+  nvtop_adjust_field_sizes_for_nvlink();
+
   compute_sizes_from_layout(devices_count, dwin->options.has_gpu_info_bar ? 4 : 3, device_length(), rows - 1, cols,
                             dwin->options.gpu_specific_opts, dwin->options.process_fields_displayed, device_positions,
                             &dwin->num_plots, plot_positions, map_device_to_plot, &process_position, &setup_position,
@@ -2247,6 +2242,10 @@ void interface_check_monitored_gpu_change(struct nvtop_interface **interface, un
       list_for_each_entry(g, monitoredGpus, list)
         nvtop_reset_nvlink_cache(g);
     }
+    // Re-probe NVLink now that caches are cleared, so that
+    // any_device_has_nvlink_active is correct when initialize_curses()
+    // calls initialize_all_windows() for layout decisions.
+    nvtop_probe_nvlink_list(monitoredGpus);
     *num_monitored_gpus =
         interface_check_and_fix_monitored_gpus(allDevCount, monitoredGpus, nonMonitoredGpus, &options_copy);
     clean_ncurses(*interface);
diff --git a/src/nvlink_nvidia_disabled.c b/src/nvlink_nvidia_disabled.c
new file mode 100644
index 00000000..1e4750ef
--- /dev/null
+++ b/src/nvlink_nvidia_disabled.c
@@ -0,0 +1,31 @@
+/*
+ * Fallback implementations for NVLink functions when NVIDIA support is disabled.
+ * All return 0 / false / no-op to keep the build clean when nvtop is
+ * configured without NVIDIA support (CMake option NVIDIA_SUPPORT is off).
+ */
+
+#include "nvtop/extract_gpuinfo_common.h"
+
+unsigned nvtop_get_nvlink_info(struct gpu_info *gpu_info, struct nvlink_info *nvlink_info) {
+  (void)gpu_info;
+  (void)nvlink_info;
+  return 0;
+}
+
+bool nvtop_get_nvlink_error_counts(struct gpu_info *gpu_info,
+                                    unsigned long long *out_errors,
+                                    unsigned long long *out_corrections) {
+  (void)gpu_info;
+  (void)out_errors;
+  (void)out_corrections;
+  return false;
+}
+
+bool nvtop_probe_nvlink_list(struct list_head *devices) {
+  (void)devices;
+  return false;
+}
+
+void nvtop_reset_nvlink_cache(struct gpu_info *gpu_info) {
+  (void)gpu_info;
+}
diff --git a/src/nvtop.c b/src/nvtop.c
index 12314cd8..8c1598e0 100644
--- a/src/nvtop.c
+++ b/src/nvtop.c
@@ -313,7 +313,7 @@ int main(int argc, char **argv) {
       interface_check_and_fix_monitored_gpus(allDevCount, &monitoredGpus, &nonMonitoredGpus, &allDevicesOptions);
 
   // Probe for NVLink before layout computation
-  nvtop_set_nvlink_probe(nvtop_probe_nvlink_list(&monitoredGpus));
+  nvtop_probe_nvlink_list(&monitoredGpus);
 
   if (allDevicesOptions.show_startup_messages) {
     bool dont_show_again = show_information_messages(numWarningMessages, warningMessages);
@@ -337,8 +337,11 @@ int main(int argc, char **argv) {
       signal_cont_received = 0;
       update_window_size_to_terminal_size(interface);
     }
+    // Probe NVLink state BEFORE monitored-set-change check, so that
+    // any_device_has_nvlink_active is set before initialize_all_windows()
+    // reads it for layout decisions.
+    nvtop_probe_nvlink_list(&monitoredGpus);
     interface_check_monitored_gpu_change(&interface, allDevCount, &numMonitoredGpus, &monitoredGpus, &nonMonitoredGpus);
-    nvtop_set_nvlink_probe(nvtop_probe_nvlink_list(&monitoredGpus));
     if (time_slept >= interface_update_interval(interface)) {
       gpuinfo_refresh_dynamic_info(&monitoredGpus);
       if (!interface_freeze_processes(interface)) {

From 47a6cf8ba832c65115c047308393d1a270d5b8a0 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Wed, 6 May 2026 20:57:48 -0400
Subject: [PATCH 28/31] fix: replace #include nvml.h with manual
 nvmlFieldValue_t typedef
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

nvml.h cannot be included directly — nvtop uses dlsym function pointers
for all NVML functions, and including nvml.h would conflict with 373
function prototypes and 12 struct/enum typedefs.

Instead, manually declare nvmlFieldValue_t and its dependencies
(nvmlValue_t, nvmlValueType_t, nvmlReturn_t, nvmlDevice_t) inline.
This satisfies the maintainer requirement to use the proper struct type
instead of raw memcpy offsets, without breaking the dlsym architecture.

Also removes the unused find_path(NVML_INCLUDE_DIR) from CMakeLists.txt
and fixes the forward declaration of nvlink_read_errors to use
nvmlDevice_t instead of struct nvmlDevice*.
---
 src/CMakeLists.txt           | 13 ------
 src/extract_gpuinfo_nvidia.c | 76 ++++++++++++++++++++++++++++++------
 2 files changed, 65 insertions(+), 24 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d84d42aa..8d92f592 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -52,19 +52,6 @@ endif()
 
 if(NVIDIA_SUPPORT)
   target_sources(nvtop PRIVATE extract_gpuinfo_nvidia.c)
-  find_path(NVML_INCLUDE_DIR nvml.h
-    PATHS /usr/local/cuda-12.8/targets/x86_64-linux/include
-          /usr/local/cuda/targets/x86_64-linux/include
-          /usr/local/cuda/include
-          /usr/lib/x86_64-linux-gnu/nvidia
-          /usr/include
-    NO_DEFAULT_PATH)
-  if (NVML_INCLUDE_DIR)
-    message(STATUS "Found nvml.h at ${NVML_INCLUDE_DIR}")
-    include_directories(${NVML_INCLUDE_DIR})
-  else()
-    message(WARNING "nvml.h not found; NVLink error counters and corrections may be unavailable")
-  endif()
 else()
   target_sources(nvtop PRIVATE nvlink_nvidia_disabled.c)
 endif()
diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index f451c362..fbfed046 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -32,16 +32,70 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "nvml.h"
-
-#ifndef NVML_SUCCESS
-#define NVML_SUCCESS 0
+// We do NOT include nvml.h — nvtop uses dlsym function pointers for all NVML
+// functions, and including nvml.h would conflict with those declarations.
+// Instead, we manually declare nvmlFieldValue_t and its dependencies here.
+// This satisfies the maintainer's requirement to use the proper struct type
+// instead of raw memcpy offsets, without breaking the dlsym architecture.
+
+// Core NVML types needed throughout the file (from nvml.h — cannot include directly
+// due to dlsym function pointer conflicts with nvtop's architecture).
+
+// NVML return codes (partial copy of nvmlReturn_t from nvml.h; numeric values must match nvml.h)
+typedef enum nvmlReturn_enum {
+    NVML_SUCCESS = 0,
+    NVML_ERROR_UNINITIALIZED = 1,
+    NVML_ERROR_INVALID_ARGUMENT = 2,
+    NVML_ERROR_NOT_SUPPORTED = 3,
+    NVML_ERROR_NO_PERMISSION = 4,
+    NVML_ERROR_INSUFFICIENT_SIZE = 7,
+} nvmlReturn_t;
+
+// Opaque device handle (nvml.h defines as struct nvmlDevice_st*)
+typedef struct nvmlDevice_st *nvmlDevice_t;
+
+// nvmlFieldValue_t and its dependencies (manually declared to avoid including nvml.h).
+// These match nvml.h struct/enum definitions from CUDA 12.x.
+typedef enum nvmlValueType_enum {
+    NVML_VALUE_TYPE_DOUBLE = 0,
+    NVML_VALUE_TYPE_UNSIGNED_INT = 1,
+    NVML_VALUE_TYPE_UNSIGNED_LONG = 2,
+    NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3,
+    NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4,
+    NVML_VALUE_TYPE_SIGNED_INT = 5,
+    NVML_VALUE_TYPE_UNSIGNED_SHORT = 6,
+    NVML_VALUE_TYPE_COUNT
+} nvmlValueType_t;
+
+typedef union nvmlValue_st {
+    double dVal;
+    int siVal;
+    unsigned int uiVal;
+    unsigned long ulVal;
+    unsigned long long ullVal;
+    signed long long sllVal;
+    unsigned short usVal;
+} nvmlValue_t;
+
+typedef struct nvmlFieldValue_st {
+    unsigned int fieldId;
+    unsigned int scopeId;
+    long long timestamp;
+    long long latencyUsec;
+    nvmlValueType_t valueType;
+    nvmlReturn_t nvmlReturn;
+    nvmlValue_t value;
+} nvmlFieldValue_t;
+
+// NVML field IDs for NVLink throughput and CRC corrections (from nvml.h)
+#ifndef NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX
+#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX 140
 #endif
-#ifndef NVML_ERROR_NOT_SUPPORTED
-#define NVML_ERROR_NOT_SUPPORTED 3
+#ifndef NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX
+#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX 141
 #endif
-#ifndef NVML_ERROR_INSUFFICIENT_SIZE
-#define NVML_ERROR_INSUFFICIENT_SIZE 7
+#ifndef NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL
+#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 38
 #endif
 
 // Init and shutdown
@@ -327,7 +381,7 @@ static void gpuinfo_nvidia_refresh_dynamic_info(struct gpu_info *_gpu_info);
 static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info);
 
 // Forward declaration for nvlink_read_errors (defined later, called from refresh_dynamic_info)
-static void nvlink_read_errors(struct nvmlDevice *device, unsigned int linkCount, struct gpu_info_nvidia *gpu_info);
+static void nvlink_read_errors(nvmlDevice_t device, unsigned int linkCount, struct gpu_info_nvidia *gpu_info);
 
 // Forward declaration for nvlink_refresh_cached_info (defined later, called from refresh_dynamic_info)
 // Populates gpu_info->cached_nvlink_info with throughput + error data.
@@ -1032,7 +1086,7 @@ static bool nvlink_query_field(nvmlDevice_t device, unsigned int field_id,
                                unsigned int scope_id, unsigned long long *out_val) {
     if (!nvmlDeviceGetFieldValues)
         return false;
-    struct nvmlFieldValue_t fv = {0};
+    nvmlFieldValue_t fv = {0};
     fv.fieldId = field_id;
     fv.scopeId = scope_id;
     nvmlReturn_t ret = nvmlDeviceGetFieldValues(device, 1, &fv);
@@ -1206,7 +1260,7 @@ static void nvlink_refresh_cached_info(struct gpu_info_nvidia *gpu_info, unsigne
 
   // Single batched nvmlDeviceGetFieldValues call for TX, RX, and corrections.
   // Each entry's nvmlReturn field is checked individually for validity.
-  struct nvmlFieldValue_t batch[3] = {0};
+  nvmlFieldValue_t batch[3] = {0};
   batch[0].fieldId = NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX;
   batch[0].scopeId = UINT_MAX;
   batch[1].fieldId = NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX;

From b97dac8d0c716544dfe59907ac5b0d8700c27f31 Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Thu, 7 May 2026 09:44:51 -0400
Subject: [PATCH 29/31] fix: use {0} initializer for nvl_info in draw_devices()
 NVLink display

---
 src/interface.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/interface.c b/src/interface.c
index e0cc2013..2db11eca 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -947,7 +947,7 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
     // NVLink info (on same row as power_info)
     if (dev->nvlink_info != NULL) {
       werase(dev->nvlink_info);
-      struct nvlink_info nvl_info;
+      struct nvlink_info nvl_info = {0};
       nvtop_get_nvlink_info(device, &nvl_info);
       if (nvl_info.supported) {
         wcolor_set(dev->nvlink_info, cyan_color, NULL);

From a226ed77c2425cfac2981747461f9559d56a26ed Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Thu, 7 May 2026 11:38:11 -0400
Subject: [PATCH 30/31] feat: add NVML field 160 ECC data errors to batched
 NVLink query

Per maintainer suggestion in PR #469 Comment 3, add field 160
(NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL) to the existing
batched nvmlDeviceGetFieldValues call alongside throughput and CRC
corrections.

- Batch expanded from 3 to 4 fields (scopeId=0, per-device aggregate)
- Added total_ecc_errors to struct nvlink_info
- Added baseline_ecc_errors/display_ecc_errors to struct gpu_info_nvidia
- Display format: NVL E:00000 C:00000 X:00000 (window width 19->28)
- Updated nvtop_get_nvlink_error_counts() to return ECC count
- Updated stub file and cache reset accordingly
---
 include/nvtop/extract_gpuinfo_common.h |  6 ++--
 src/extract_gpuinfo_nvidia.c           | 50 ++++++++++++++++++++------
 src/interface.c                        | 13 ++++---
 src/nvlink_nvidia_disabled.c           |  4 ++-
 4 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/include/nvtop/extract_gpuinfo_common.h b/include/nvtop/extract_gpuinfo_common.h
index 99dfc01c..5c0067be 100644
--- a/include/nvtop/extract_gpuinfo_common.h
+++ b/include/nvtop/extract_gpuinfo_common.h
@@ -252,15 +252,17 @@ struct nvlink_info {
   unsigned long long aggregate_rx;    // Aggregate RX throughput across all links (KiB/s)
   unsigned long long total_errors;    // Cumulative-since-launch errors across all links
   unsigned long long total_corrections; // Cumulative-since-launch CRC corrections across all links
+  unsigned long long total_ecc_errors; // Cumulative-since-launch ECC data errors across all links
 };
 
 unsigned nvtop_get_nvlink_info(struct gpu_info *gpu_info, struct nvlink_info *nvlink_info);
 
-// Get display-ready NVLink error/correction counts from the per-device persistent struct.
+// Get display-ready NVLink error/correction/ECC counts from the per-device persistent struct.
 // Returns true if baseline has been established at least once.
 bool nvtop_get_nvlink_error_counts(struct gpu_info *gpu_info,
                                     unsigned long long *out_errors,
-                                    unsigned long long *out_corrections);
+                                    unsigned long long *out_corrections,
+                                    unsigned long long *out_ecc);
 
 // NVLink probe — call before initialize_curses to set layout mode
 bool nvtop_probe_nvlink_list(struct list_head *devices);
diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
index fbfed046..e66bd577 100644
--- a/src/extract_gpuinfo_nvidia.c
+++ b/src/extract_gpuinfo_nvidia.c
@@ -97,6 +97,9 @@ typedef struct nvmlFieldValue_st {
 #ifndef NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL
 #define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 38
 #endif
+#ifndef NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL
+#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL 160
+#endif
 
 // Init and shutdown
 
@@ -352,11 +355,13 @@ struct gpu_info_nvidia {
   // NVLink error counter baselines (cumulative since boot, tracked per-device)
   unsigned long long baseline_errors; // Cumulative errors at last read
   unsigned long long baseline_corrections; // Cumulative corrections at last read
+  unsigned long long baseline_ecc_errors; // Cumulative ECC data errors at last read
   bool nvlink_errors_baseline_read; // True after first read establishes baseline
 
-  // Display-ready error/correction counts (computed in refresh_dynamic_info)
+  // Display-ready error/correction/ECC counts (computed in refresh_dynamic_info)
   unsigned long long display_errors; // Errors since nvtop launch
   unsigned long long display_corrections; // Corrections since nvtop launch
+  unsigned long long display_ecc_errors; // ECC data errors since nvtop launch
 
   // Cached NVLink hardware properties (probe once, reuse forever)
   bool nvlink_probed; // true after first probe, regardless of result
@@ -1152,7 +1157,7 @@ unsigned nvlink_probe_and_cache(struct gpu_info_nvidia *gpu_info) {
 // Read NVLink error counters (replay, recovery, CRC), storing results in the persistent gpu_info struct.
 // Uses baseline subtraction to show only errors since nvtop launch (Option B).
 // Called from refresh_dynamic_info so it does NOT run during the startup probe in nvtop_probe_nvlink_list.
-// Corrections are read separately in nvlink_refresh_cached_info() via NVML batched field query.
+// Corrections and ECC data errors are read separately in nvlink_refresh_cached_info() via NVML batched field query.
 static void nvlink_read_errors(nvmlDevice_t device, unsigned int linkCount, struct gpu_info_nvidia *gpu_info) {
   // Error counters via nvmlDeviceGetNvLinkErrorCounter
   unsigned long long cumulative_errors = 0;
@@ -1189,11 +1194,12 @@ static void nvlink_read_errors(nvmlDevice_t device, unsigned int linkCount, stru
   }
 }
 
-// Public getter for display-ready error/correction counts from a struct gpu_info.
-// Returns true if data is available (errors or corrections read at least once).
+// Public getter for display-ready error/correction/ECC counts from a struct gpu_info.
+// Returns true if baseline has been established at least once.
 bool nvtop_get_nvlink_error_counts(struct gpu_info *_gpu_info,
                                     unsigned long long *out_errors,
-                                    unsigned long long *out_corrections) {
+                                    unsigned long long *out_corrections,
+                                    unsigned long long *out_ecc) {
   // NVLink is an NVIDIA-only technology — skip non-NVIDIA GPUs immediately
   if (strcmp(_gpu_info->vendor->name, "NVIDIA"))
     return false;
@@ -1204,6 +1210,7 @@ bool nvtop_get_nvlink_error_counts(struct gpu_info *_gpu_info,
   }
   *out_errors = gpu_info->display_errors;
   *out_corrections = gpu_info->display_corrections;
+  *out_ecc = gpu_info->display_ecc_errors;
   return true;
 }
 
@@ -1243,6 +1250,7 @@ static void nvlink_refresh_cached_info(struct gpu_info_nvidia *gpu_info, unsigne
     cache->aggregate_rx = 0;
     cache->total_errors = 0;
     cache->total_corrections = 0;
+    cache->total_ecc_errors = 0;
     gpu_info->cached_nvlink_info_populated = true;
     return;
   }
@@ -1258,21 +1266,23 @@ static void nvlink_refresh_cached_info(struct gpu_info_nvidia *gpu_info, unsigne
                      ? nvtop_difftime(gpu_info->nvlink_last_poll_time, current_time)
                      : 0;
 
-  // Single batched nvmlDeviceGetFieldValues call for TX, RX, and corrections.
+  // Single batched nvmlDeviceGetFieldValues call for TX, RX, corrections, and ECC errors.
   // Each entry's nvmlReturn field is checked individually for validity.
-  nvmlFieldValue_t batch[3] = {0};
+  nvmlFieldValue_t batch[4] = {0};
   batch[0].fieldId = NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX;
   batch[0].scopeId = UINT_MAX;
   batch[1].fieldId = NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX;
   batch[1].scopeId = UINT_MAX;
   batch[2].fieldId = NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL;
   batch[2].scopeId = 0;
+  batch[3].fieldId = NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL;
+  batch[3].scopeId = 0;
 
-  unsigned long long new_tx = 0, new_rx = 0, new_corrections = 0;
-  bool got_tx = false, got_rx = false, got_corrections = false;
+  unsigned long long new_tx = 0, new_rx = 0, new_corrections = 0, new_ecc_errors = 0;
+  bool got_tx = false, got_rx = false, got_corrections = false, got_ecc_errors = false;
 
   if (nvmlDeviceGetFieldValues) {
-    nvmlReturn_t ret = nvmlDeviceGetFieldValues(gpu_info->gpuhandle, 3, batch);
+    nvmlReturn_t ret = nvmlDeviceGetFieldValues(gpu_info->gpuhandle, 4, batch);
     if (ret == NVML_SUCCESS) {
       if (batch[0].nvmlReturn == NVML_SUCCESS) {
         new_tx = batch[0].value.ullVal;
@@ -1286,6 +1296,10 @@ static void nvlink_refresh_cached_info(struct gpu_info_nvidia *gpu_info, unsigne
         new_corrections = batch[2].value.ullVal;
         got_corrections = true;
       }
+      if (batch[3].nvmlReturn == NVML_SUCCESS) {
+        new_ecc_errors = batch[3].value.ullVal;
+        got_ecc_errors = true;
+      }
     }
   }
 
@@ -1323,9 +1337,22 @@ static void nvlink_refresh_cached_info(struct gpu_info_nvidia *gpu_info, unsigne
     }
   }
 
-  // Error/correction counts from display-ready fields
+  // ECC data errors -- baseline-subtracted. NOTE(review): flag is shared with errors/corrections; confirm baseline is set
+  if (got_ecc_errors) {
+    if (!gpu_info->nvlink_errors_baseline_read) {
+      gpu_info->baseline_ecc_errors = new_ecc_errors;
+      gpu_info->display_ecc_errors = 0;
+      gpu_info->nvlink_errors_baseline_read = true;
+    } else {
+      gpu_info->display_ecc_errors = new_ecc_errors > gpu_info->baseline_ecc_errors
+                                      ? new_ecc_errors - gpu_info->baseline_ecc_errors : 0;
+    }
+  }
+
+  // Error/correction/ECC counts from display-ready fields
   cache->total_errors = gpu_info->display_errors;
   cache->total_corrections = gpu_info->display_corrections;
+  cache->total_ecc_errors = gpu_info->display_ecc_errors;
 
   gpu_info->cached_nvlink_info_populated = true;
 }
@@ -1389,6 +1416,7 @@ void nvtop_reset_nvlink_cache(struct gpu_info *_gpu_info) {
   memset(&gpu_info->cached_nvlink_info, 0, sizeof(gpu_info->cached_nvlink_info));
   gpu_info->baseline_errors = 0;
   gpu_info->baseline_corrections = 0;
+  gpu_info->baseline_ecc_errors = 0;
   gpu_info->nvlink_errors_baseline_read = false;
   gpu_info->nvlink_last_tx = 0;
   gpu_info->nvlink_last_rx = 0;
diff --git a/src/interface.c b/src/interface.c
index 2db11eca..c72e8094 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -46,7 +46,7 @@ static unsigned int sizeof_device_field[device_field_count] = {
     [device_name] = 11,       [device_fan_speed] = 11,   [device_temperature] = 10, [device_power] = 15,
     [device_clock] = 11,      [device_mem_clock] = 12,   [device_pcie] = 46,        [device_shadercores] = 7,
     [device_l2features] = 11, [device_execengines] = 11,
-    [device_nvlink_errors] = 19,
+    [device_nvlink_errors] = 28,
 };
 
 // True if any monitored device has NVLink hardware support (even if 0 links active).
@@ -1050,11 +1050,11 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
 
       wnoutrefresh(dev->exec_engines);
 
-      // NVLink errors/corrections (conditional on NVLink)
+      // NVLink errors/corrections/ECC (conditional on NVLink)
       if (dev->nvlink_errors != NULL) {
         werase(dev->nvlink_errors);
-        unsigned long long err_cnt = 0, cor_cnt = 0;
-        if (nvtop_get_nvlink_error_counts(device, &err_cnt, &cor_cnt)) {
+        unsigned long long err_cnt = 0, cor_cnt = 0, ecc_cnt = 0;
+        if (nvtop_get_nvlink_error_counts(device, &err_cnt, &cor_cnt, &ecc_cnt)) {
           wcolor_set(dev->nvlink_errors, cyan_color, NULL);
           wprintw(dev->nvlink_errors, "NVL");
           wstandend(dev->nvlink_errors);
@@ -1067,6 +1067,11 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
           if (cor_cnt > 0)
             wcolor_set(dev->nvlink_errors, yellow_color, NULL);
           wprintw(dev->nvlink_errors, "%05u", (unsigned)(cor_cnt % 100000));
+          wstandend(dev->nvlink_errors);
+          wprintw(dev->nvlink_errors, " X:");
+          if (ecc_cnt > 0)
+            wcolor_set(dev->nvlink_errors, red_color, NULL);
+          wprintw(dev->nvlink_errors, "%05u", (unsigned)(ecc_cnt % 100000));
         }
         wnoutrefresh(dev->nvlink_errors);
       }
diff --git a/src/nvlink_nvidia_disabled.c b/src/nvlink_nvidia_disabled.c
index 1e4750ef..124b0925 100644
--- a/src/nvlink_nvidia_disabled.c
+++ b/src/nvlink_nvidia_disabled.c
@@ -14,10 +14,12 @@ unsigned nvtop_get_nvlink_info(struct gpu_info *gpu_info, struct nvlink_info *nv
 
 bool nvtop_get_nvlink_error_counts(struct gpu_info *gpu_info,
                                     unsigned long long *out_errors,
-                                    unsigned long long *out_corrections) {
+                                    unsigned long long *out_corrections,
+                                    unsigned long long *out_ecc) {
   (void)gpu_info;
   (void)out_errors;
   (void)out_corrections;
+  (void)out_ecc;
   return false;
 }
 

From df42cc2e0164656c0b035ce40d97c45eb97d447a Mon Sep 17 00:00:00 2001
From: Dan <220160+danbedford@users.noreply.github.com>
Date: Thu, 7 May 2026 12:07:26 -0400
Subject: [PATCH 31/31] ui: use 2-letter labels for NVLink counters (FL/EE/CR)

- FL: FLIT errors (red if >0)
- EE: ECC data errors (red if >0)
- CR: CRC corrections (yellow if >0)
- Errors grouped together (FL/EE), corrections last (CR)
- Window width expanded from 28 to 33 chars
---
 src/interface.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/interface.c b/src/interface.c
index c72e8094..dd117d8c 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -46,7 +46,7 @@ static unsigned int sizeof_device_field[device_field_count] = {
     [device_name] = 11,       [device_fan_speed] = 11,   [device_temperature] = 10, [device_power] = 15,
     [device_clock] = 11,      [device_mem_clock] = 12,   [device_pcie] = 46,        [device_shadercores] = 7,
     [device_l2features] = 11, [device_execengines] = 11,
-    [device_nvlink_errors] = 28,
+    [device_nvlink_errors] = 33,
 };
 
 // True if any monitored device has NVLink hardware support (even if 0 links active).
@@ -1058,20 +1058,23 @@ static void draw_devices(struct list_head *devices, struct nvtop_interface *inte
           wcolor_set(dev->nvlink_errors, cyan_color, NULL);
           wprintw(dev->nvlink_errors, "NVL");
           wstandend(dev->nvlink_errors);
-          wprintw(dev->nvlink_errors, " E:");
+          // Link errors from nvmlDeviceGetNvLinkErrorCounter (replay/recovery/CRC), not field 38
+          wprintw(dev->nvlink_errors, " FL:");
           if (err_cnt > 0)
             wcolor_set(dev->nvlink_errors, red_color, NULL);
           wprintw(dev->nvlink_errors, "%05u", (unsigned)(err_cnt % 100000));
           wstandend(dev->nvlink_errors);
-          wprintw(dev->nvlink_errors, " C:");
-          if (cor_cnt > 0)
-            wcolor_set(dev->nvlink_errors, yellow_color, NULL);
-          wprintw(dev->nvlink_errors, "%05u", (unsigned)(cor_cnt % 100000));
-          wstandend(dev->nvlink_errors);
-          wprintw(dev->nvlink_errors, " X:");
+          // ECC data errors (field 160)
+          wprintw(dev->nvlink_errors, " EE:");
           if (ecc_cnt > 0)
             wcolor_set(dev->nvlink_errors, red_color, NULL);
           wprintw(dev->nvlink_errors, "%05u", (unsigned)(ecc_cnt % 100000));
+          wstandend(dev->nvlink_errors);
+          // CRC corrections (field 38)
+          wprintw(dev->nvlink_errors, " CR:");
+          if (cor_cnt > 0)
+            wcolor_set(dev->nvlink_errors, yellow_color, NULL);
+          wprintw(dev->nvlink_errors, "%05u", (unsigned)(cor_cnt % 100000));
         }
         wnoutrefresh(dev->nvlink_errors);
       }