Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions apps/provider-inventory/src/config/env.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ import { z } from "zod";

/**
 * Zod schema describing the environment variables this app's configuration is built from.
 *
 * PROVIDER_INVENTORY_POSTGRES_URL is the only key without a default; every other
 * key falls back to the literal passed to `.default(...)` when it is not supplied.
 *
 * NOTE(review): this listing appears to contain both the pre-change lines (with
 * `.optional().default(...)`) and the post-change lines (with `.default(...)` only) of a
 * diff, so several keys show up twice — confirm against the actual file which set is live.
 */
export const envSchema = z.object({
// Required connection string; validated only as a string here.
PROVIDER_INVENTORY_POSTGRES_URL: z.string(),
DRIZZLE_MIGRATIONS_FOLDER: z.string().optional().default("./drizzle"),
LOG_LEVEL: z.enum(["fatal", "error", "warn", "info", "debug", "trace"]).optional().default("info"),
STD_OUT_LOG_FORMAT: z.enum(["json", "pretty"]).optional().default("json"),
NODE_ENV: z.enum(["development", "production", "test"]).optional().default("development"),
// `coerce: true` asks zod to coerce the raw value to a number before validating
// (presumably because the value arrives as a string — verify against the caller).
PORT: z.number({ coerce: true }).optional().default(3092)
DRIZZLE_MIGRATIONS_FOLDER: z.string().default("./drizzle"),
LOG_LEVEL: z.enum(["fatal", "error", "warn", "info", "debug", "trace"]).default("info"),
STD_OUT_LOG_FORMAT: z.enum(["json", "pretty"]).default("json"),
PORT: z.number({ coerce: true }).default(3092),
// Default is expressed in milliseconds: 10 * 60 * 1000 = 600000.
DISCOVERY_INTERVAL_MS: z.number({ coerce: true }).default(10 * 60 * 1000) // 10 minutes
});

/** Static type inferred from `envSchema`. */
export type EnvConfig = z.infer<typeof envSchema>;
8 changes: 7 additions & 1 deletion apps/provider-inventory/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ import { container } from "tsyringe";

import { APP_CONFIG } from "@src/providers/app-config.provider";
import { healthzRouter } from "@src/routes";
import { DiscoverySchedulerService } from "@src/services/discovery-scheduler/discovery-scheduler.service";
import { HonoErrorHandlerService } from "@src/services/hono-error-handler/hono-error-handler.service";
import { ProviderInventoryWriterService } from "@src/services/provider-inventory-writer/provider-inventory-writer.service";
import { startServer } from "@src/services/start-server/start-server";
import type { AppEnv } from "@src/types/app-context";

Expand All @@ -28,6 +30,10 @@ export async function bootstrap(): Promise<void> {
const app = createApp();

await startServer(app, createOtelLogger({ context: "APP" }), process, {
port: container.resolve(APP_CONFIG).PORT
port: container.resolve(APP_CONFIG).PORT,
beforeStart: async () => {
await container.resolve(ProviderInventoryWriterService).resetOnlineSince();
container.resolve(DiscoverySchedulerService).start();
}
});
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
import { describe, expect, it } from "vitest";

import { computeRollups } from "./compute-rollups";

// Spec for computeRollups: each case feeds a hand-built inventory ({ nodes, storage })
// and checks the resulting bigint totals, per-node maxima, and the sorted
// gpuModels / storageClasses lists.
describe(computeRollups.name, () => {
// Baseline: with no nodes and no storage every numeric field is 0n and both lists are empty.
it("returns all zeros for an empty cluster", () => {
const result = computeRollups({ nodes: [], storage: [] });

expect(result).toEqual({
totalAvailableCpu: 0n,
totalAvailableMemory: 0n,
totalAvailableGpu: 0n,
totalAvailableEph: 0n,
totalAvailablePersistent: 0n,
maxNodeFreeCpu: 0n,
maxNodeFreeMemory: 0n,
maxNodeFreeGpu: 0n,
gpuModels: [],
storageClasses: []
});
});

// With exactly one node, each total equals that node's value and so does each max.
it("computes rollups for a single node", () => {
const result = computeRollups({
nodes: [
{
name: "node-1",
cpu: { available: 4000 },
memory: { available: 8_000_000_000 },
gpu: [{ vendor: "nvidia", model: "a100", available: 2 }],
ephStorage: { available: 100_000_000_000 },
persistentStorage: [{ class: "beta2", available: 500_000_000_000 }]
}
],
storage: []
});

expect(result.totalAvailableCpu).toBe(4000n);
expect(result.totalAvailableMemory).toBe(8_000_000_000n);
expect(result.totalAvailableGpu).toBe(2n);
expect(result.totalAvailableEph).toBe(100_000_000_000n);
expect(result.totalAvailablePersistent).toBe(500_000_000_000n);
expect(result.maxNodeFreeCpu).toBe(4000n);
expect(result.maxNodeFreeMemory).toBe(8_000_000_000n);
expect(result.maxNodeFreeGpu).toBe(2n);
// GPU models are reported as "<vendor>/<model>".
expect(result.gpuModels).toEqual(["nvidia/a100"]);
expect(result.storageClasses).toEqual(["beta2"]);
});

// Totals are sums over both nodes; the max* fields come from node-2, the larger node.
it("sums totals across multiple nodes and tracks max-per-node", () => {
const result = computeRollups({
nodes: [
{
name: "node-1",
cpu: { available: 2000 },
memory: { available: 4_000_000_000 },
gpu: [{ vendor: "nvidia", model: "a100", available: 1 }],
ephStorage: { available: 50_000_000_000 },
persistentStorage: []
},
{
name: "node-2",
cpu: { available: 8000 },
memory: { available: 16_000_000_000 },
gpu: [{ vendor: "nvidia", model: "a100", available: 4 }],
ephStorage: { available: 200_000_000_000 },
persistentStorage: [{ class: "beta2", available: 1_000_000_000_000 }]
}
],
storage: []
});

expect(result.totalAvailableCpu).toBe(10_000n);
expect(result.totalAvailableMemory).toBe(20_000_000_000n);
expect(result.totalAvailableGpu).toBe(5n);
expect(result.totalAvailableEph).toBe(250_000_000_000n);
expect(result.totalAvailablePersistent).toBe(1_000_000_000_000n);
expect(result.maxNodeFreeCpu).toBe(8000n);
expect(result.maxNodeFreeMemory).toBe(16_000_000_000n);
expect(result.maxNodeFreeGpu).toBe(4n);
});

// "nvidia/a100" appears on both nodes but must be listed once; the list is sorted,
// so "amd/mi300x" comes first.
it("deduplicates GPU models across nodes", () => {
const result = computeRollups({
nodes: [
{
name: "node-1",
cpu: { available: 1000 },
memory: { available: 1000 },
gpu: [
{ vendor: "nvidia", model: "a100", available: 1 },
{ vendor: "amd", model: "mi300x", available: 1 }
],
ephStorage: { available: 0 },
persistentStorage: []
},
{
name: "node-2",
cpu: { available: 1000 },
memory: { available: 1000 },
gpu: [{ vendor: "nvidia", model: "a100", available: 2 }],
ephStorage: { available: 0 },
persistentStorage: []
}
],
storage: []
});

expect(result.gpuModels).toEqual(["amd/mi300x", "nvidia/a100"]);
});

// A node with ephemeral storage only: the persistent total stays 0n and no classes are reported.
it("handles ephemeral-only storage", () => {
const result = computeRollups({
nodes: [
{
name: "node-1",
cpu: { available: 1000 },
memory: { available: 1000 },
gpu: [],
ephStorage: { available: 500_000_000_000 },
persistentStorage: []
}
],
storage: []
});

expect(result.totalAvailableEph).toBe(500_000_000_000n);
expect(result.totalAvailablePersistent).toBe(0n);
expect(result.storageClasses).toEqual([]);
});

// The persistent total is summed across classes on the same node (100e9 + 200e9).
it("handles persistent-only storage with multiple classes", () => {
const result = computeRollups({
nodes: [
{
name: "node-1",
cpu: { available: 1000 },
memory: { available: 1000 },
gpu: [],
ephStorage: { available: 0 },
persistentStorage: [
{ class: "beta2", available: 100_000_000_000 },
{ class: "beta3", available: 200_000_000_000 }
]
}
],
storage: []
});

expect(result.totalAvailablePersistent).toBe(300_000_000_000n);
expect(result.storageClasses).toEqual(["beta2", "beta3"]);
});

// Class names from the top-level `storage` array are merged with those found on nodes.
// Only the class list is asserted here, not any total.
it("collects storage classes from both nodes and cluster-level storage", () => {
const result = computeRollups({
nodes: [
{
name: "node-1",
cpu: { available: 0 },
memory: { available: 0 },
gpu: [],
ephStorage: { available: 0 },
persistentStorage: [{ class: "beta2", available: 100 }]
}
],
storage: [{ class: "beta3", available: 500 }]
});

expect(result.storageClasses).toEqual(["beta2", "beta3"]);
});

// An empty gpu array yields zero GPU totals and an empty model list.
it("handles nodes with no GPUs", () => {
const result = computeRollups({
nodes: [
{
name: "node-1",
cpu: { available: 4000 },
memory: { available: 8_000_000_000 },
gpu: [],
ephStorage: { available: 100_000_000_000 },
persistentStorage: []
}
],
storage: []
});

expect(result.totalAvailableGpu).toBe(0n);
expect(result.maxNodeFreeGpu).toBe(0n);
expect(result.gpuModels).toEqual([]);
});

// Negative "available" readings must contribute 0n rather than reduce the totals.
it("clamps negative values to zero (overcommit)", () => {
const result = computeRollups({
nodes: [
{
name: "overcommitted",
cpu: { available: -500 },
memory: { available: -1_000_000 },
gpu: [{ vendor: "nvidia", model: "a100", available: -1 }],
ephStorage: { available: -100 },
persistentStorage: [{ class: "beta2", available: -200 }]
}
],
storage: []
});

expect(result.totalAvailableCpu).toBe(0n);
expect(result.totalAvailableMemory).toBe(0n);
expect(result.totalAvailableGpu).toBe(0n);
expect(result.totalAvailableEph).toBe(0n);
expect(result.totalAvailablePersistent).toBe(0n);
expect(result.maxNodeFreeCpu).toBe(0n);
expect(result.maxNodeFreeMemory).toBe(0n);
expect(result.maxNodeFreeGpu).toBe(0n);
});

// A node that reports 0 everywhere produces 0n totals and maxima.
it("handles all-zero capacity", () => {
const result = computeRollups({
nodes: [
{
name: "idle",
cpu: { available: 0 },
memory: { available: 0 },
gpu: [],
ephStorage: { available: 0 },
persistentStorage: []
}
],
storage: []
});

expect(result.totalAvailableCpu).toBe(0n);
expect(result.totalAvailableMemory).toBe(0n);
expect(result.maxNodeFreeCpu).toBe(0n);
expect(result.maxNodeFreeMemory).toBe(0n);
});

// maxNodeFreeGpu compares per-node sums across all GPU entries:
// node-1 has 2 + 3 = 5, node-2 has 4, so the max is 5 and the total is 9.
it("sums GPU count per node for max-node-free-gpu", () => {
const result = computeRollups({
nodes: [
{
name: "node-1",
cpu: { available: 0 },
memory: { available: 0 },
gpu: [
{ vendor: "nvidia", model: "a100", available: 2 },
{ vendor: "nvidia", model: "h100", available: 3 }
],
ephStorage: { available: 0 },
persistentStorage: []
},
{
name: "node-2",
cpu: { available: 0 },
memory: { available: 0 },
gpu: [{ vendor: "nvidia", model: "a100", available: 4 }],
ephStorage: { available: 0 },
persistentStorage: []
}
],
storage: []
});

expect(result.maxNodeFreeGpu).toBe(5n);
expect(result.totalAvailableGpu).toBe(9n);
expect(result.gpuModels).toEqual(["nvidia/a100", "nvidia/h100"]);
});
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import type { Inventory, InventoryRollups } from "@src/types/inventory";

/**
 * Builds the aggregate ("rollup") view of an inventory.
 *
 * Every per-node `available` figure is first passed through `clamp`, then:
 * - CPU, memory, ephemeral storage, GPU and persistent storage are summed into cluster totals;
 * - the largest single-node CPU, memory and GPU figures are tracked separately
 *   (the GPU figure for a node is the sum over all of that node's GPU entries);
 * - GPU models are collected as `"<vendor>/<model>"` when both parts are truthy;
 * - storage class names are collected from node persistent storage and from `inventory.storage`.
 *
 * @param inventory - Inventory whose `nodes` and `storage` collections are read.
 * @returns Totals and maxima as bigints, plus de-duplicated, sorted `gpuModels` and `storageClasses`.
 */
export function computeRollups(inventory: Inventory): InventoryRollups {
  const pickLarger = (current: bigint, candidate: bigint): bigint => (candidate > current ? candidate : current);

  const totals = {
    totalAvailableCpu: 0n,
    totalAvailableMemory: 0n,
    totalAvailableGpu: 0n,
    totalAvailableEph: 0n,
    totalAvailablePersistent: 0n,
    maxNodeFreeCpu: 0n,
    maxNodeFreeMemory: 0n,
    maxNodeFreeGpu: 0n
  };
  const seenGpuModels = new Set<string>();
  const seenStorageClasses = new Set<string>();

  for (const member of inventory.nodes) {
    const freeCpu = clamp(member.cpu.available);
    const freeMemory = clamp(member.memory.available);
    const freeEph = clamp(member.ephStorage.available);

    totals.totalAvailableCpu += freeCpu;
    totals.totalAvailableMemory += freeMemory;
    totals.totalAvailableEph += freeEph;

    totals.maxNodeFreeCpu = pickLarger(totals.maxNodeFreeCpu, freeCpu);
    totals.maxNodeFreeMemory = pickLarger(totals.maxNodeFreeMemory, freeMemory);

    // GPUs are compared per node, so accumulate this node's entries before taking the max.
    let freeGpuOnNode = 0n;
    for (const device of member.gpu) {
      const freeDevices = clamp(device.available);
      freeGpuOnNode += freeDevices;
      totals.totalAvailableGpu += freeDevices;

      // Entries missing either half of the identifier are counted but not listed as a model.
      if (!device.vendor || !device.model) continue;
      seenGpuModels.add(`${device.vendor}/${device.model}`);
    }
    totals.maxNodeFreeGpu = pickLarger(totals.maxNodeFreeGpu, freeGpuOnNode);

    for (const volume of member.persistentStorage) {
      totals.totalAvailablePersistent += clamp(volume.available);
      if (volume.class) {
        seenStorageClasses.add(volume.class);
      }
    }
  }

  // Top-level storage entries only contribute their class names here.
  for (const entry of inventory.storage) {
    if (entry.class) {
      seenStorageClasses.add(entry.class);
    }
  }

  return {
    ...totals,
    gpuModels: Array.from(seenGpuModels).sort(),
    storageClasses: Array.from(seenStorageClasses).sort()
  };
}

/**
 * Converts a numeric reading into a non-negative bigint.
 *
 * @param value - Raw number to convert.
 * @returns `0n` when `value` is NaN, infinite, zero or negative; otherwise `value`
 *          rounded down to a whole number, as a bigint.
 */
function clamp(value: number): bigint {
  if (!Number.isFinite(value) || value <= 0) return 0n;
  // BigInt() throws a RangeError for non-integer numbers (e.g. 1.5), so drop the
  // fractional part first; flooring keeps the result from overstating the reading.
  return BigInt(Math.floor(value));
}
Comment thread
stalniy marked this conversation as resolved.
Loading
Loading