diff --git a/docs/superpowers/plans/2026-04-14-vm-migration-test.md b/docs/superpowers/plans/2026-04-14-vm-migration-test.md new file mode 100644 index 0000000..2b7123c --- /dev/null +++ b/docs/superpowers/plans/2026-04-14-vm-migration-test.md @@ -0,0 +1,324 @@ +# VM Migration Integration Test — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add an integration test that verifies VM migration between CRNs by creating an instance, unlinking its hosting CRN, and asserting the scheduler reallocates to a different CRN with SSH restored. + +**Architecture:** One new test file (`tests/test_migration.py`) with self-contained helpers, plus one new session-scoped fixture (`crn_nodes`) in `tests/conftest.py`. The test follows the same patterns as the existing `test_instances.py` — polling helpers, HTTP GET, subprocess SSH. + +**Tech Stack:** Python 3.10+, pytest, pytest-timeout, aleph CLI, urllib + +--- + +## File Map + +| File | Action | Responsibility | +|------|--------|----------------| +| `tests/conftest.py` | Modify (append fixture) | Add `crn_nodes` session fixture | +| `tests/test_migration.py` | Create | Migration test + helpers | + +--- + +### Task 1: Add `crn_nodes` fixture to conftest + +**Files:** +- Modify: `tests/conftest.py` (append after line 261) + +- [ ] **Step 1: Add the `crn_nodes` fixture** + +Append to the end of `tests/conftest.py`: + +```python +@pytest.fixture(scope="session") +def crn_nodes(ccn_aggregates): + """Registered CRN entries from the corechannel aggregate. + + Returns a list of dicts, each with at least 'hash' and 'address' keys. + Requires CRN_COUNT=2 (or more) during provisioning. 
+ """ + NODESTATUS_ADDR = "0x70997970C51812dc3A010C7d01b50e0d17dc79C8" + agg = ccn_aggregates(NODESTATUS_ADDR, "corechannel") + if agg is None: + pytest.skip("No corechannel aggregate — migration tests require registered CRNs") + resource_nodes = agg.get("resource_nodes", []) + if len(resource_nodes) < 2: + pytest.skip( + f"Need at least 2 CRNs for migration tests, found {len(resource_nodes)}" + ) + return resource_nodes +``` + +Note: we use `pytest.skip` instead of `assert` so that the migration test is skipped gracefully when run against a single-CRN environment rather than failing hard. + +- [ ] **Step 2: Verify the fixture loads without error** + +Run: `cd /home/olivier/git/aleph/aleph-testnets && python -c "import ast; ast.parse(open('tests/conftest.py').read()); print('OK')"` + +Expected: `OK` + +- [ ] **Step 3: Commit** + +```bash +git add tests/conftest.py +git commit -m "feat: add crn_nodes fixture for multi-CRN tests" +``` + +--- + +### Task 2: Create `test_migration.py` with helpers and test skeleton + +**Files:** +- Create: `tests/test_migration.py` + +- [ ] **Step 1: Create the test file with helpers and the full test** + +Write `tests/test_migration.py`: + +```python +"""Integration test for VM migration between CRNs. + +Flow: create instance → verify SSH on initial CRN → unlink that CRN → +verify scheduler reallocates to a different CRN → verify SSH on new CRN. + +Requires: +- Two CRNs provisioned and linked (CRN_COUNT=2) +- Scheduler-rs with dispatch enabled +- Ubuntu rootfs image (ALEPH_TESTNET_ROOTFS) +""" +import json +import subprocess +import time +import urllib.request +import urllib.error +from urllib.parse import urlparse + +import pytest + + +def _poll(description, fetch, timeout, interval=5): + """Poll fetch() until it returns a truthy value or timeout is reached. + + fetch() should return the result on success or None to keep polling. + It may raise to keep polling (exceptions are swallowed until timeout). 
+ """ + deadline = time.time() + timeout + last_err = None + while time.time() < deadline: + try: + result = fetch() + if result is not None: + return result + except Exception as e: + last_err = e + time.sleep(interval) + pytest.fail(f"{description} did not succeed within {timeout}s (last error: {last_err})") + + +def _http_get_json(url): + """GET a URL and return parsed JSON, or None on HTTP error.""" + req = urllib.request.Request(url, headers={"Accept": "application/json"}) + try: + resp = urllib.request.urlopen(req, timeout=10) + return json.loads(resp.read()) + except (urllib.error.HTTPError, urllib.error.URLError, OSError): + return None + + +def _crn_base_url(raw_url): + """Normalize a CRN URL to http://host:port.""" + parsed = urlparse(raw_url) + host = parsed.hostname + port = parsed.port or 4020 + return f"http://{host}:{port}", host + + +def _find_crn_hash(crn_nodes, allocation_url): + """Match a scheduler allocation URL to a CRN node hash.""" + alloc_host = urlparse(allocation_url).hostname + for node in crn_nodes: + if urlparse(node["address"]).hostname == alloc_host: + return node["hash"] + pytest.fail( + f"No CRN in corechannel aggregate matches allocation URL {allocation_url}" + ) + + +def _wait_for_ssh(private_key_path, host, port, timeout=60): + """Poll SSH until 'echo hello' succeeds on the given host:port.""" + def try_ssh(): + result = subprocess.run( + [ + "ssh", + "-i", private_key_path, + "-p", str(port), + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-o", "ConnectTimeout=5", + f"root@{host}", + "echo hello", + ], + capture_output=True, + text=True, + timeout=15, + ) + if result.returncode == 0 and "hello" in result.stdout: + return result.stdout.strip() + return None + + return _poll(f"SSH into {host}:{port}", try_ssh, timeout=timeout) + + +def _wait_for_vm_ssh_port(crn_base, instance_hash, timeout=120): + """Poll a CRN's execution list until the VM is running with a mapped SSH port.""" + def 
fetch_ssh_port(): + data = _http_get_json(f"{crn_base}/v2/about/executions/list") + if not data: + return None + execution = data.get(instance_hash) + if not execution: + return None + if not execution.get("running"): + return None + networking = execution.get("networking", {}) + mapped_ports = networking.get("mapped_ports", {}) + port_22 = mapped_ports.get("22") or mapped_ports.get(22) + if port_22 and port_22.get("host"): + return int(port_22["host"]) + return None + + return _poll(f"VM boot + SSH port on {crn_base}", fetch_ssh_port, timeout=timeout) + + +@pytest.mark.timeout(900) +def test_instance_migration( + aleph_cli, rootfs_image, ssh_key_pair, scheduler_api_url, ccn_url, crn_nodes +): + """End-to-end: create instance → SSH → unlink CRN → scheduler migrates → SSH on new CRN.""" + private_key_path, public_key_path = ssh_key_pair + + # --- Phase 1: Create instance and verify on initial CRN --- + + # Upload rootfs + upload_result = aleph_cli( + "file", "upload", rootfs_image, "--storage-engine", "storage", parse_json=True + ) + rootfs_hash = upload_result["item_hash"] + assert rootfs_hash, "Upload should return an item_hash" + + # Create instance + instance_result = aleph_cli( + "instance", "create", + "--rootfs", rootfs_hash, + "--rootfs-size", "4GiB", + "--ssh-pubkey-file", public_key_path, + "--vcpus", "1", + "--memory", "2GiB", + parse_json=True, + ) + instance_hash = instance_result["item_hash"] + assert instance_hash, "Instance create should return an item_hash" + + # Poll scheduler-api for initial allocation + def fetch_allocation(): + data = _http_get_json( + f"{scheduler_api_url}/api/v0/allocation/{instance_hash}" + ) + if data and data.get("node", {}).get("url"): + return data + return None + + allocation = _poll("Scheduler allocation", fetch_allocation, timeout=180) + initial_crn_url = allocation["node"]["url"] + assert initial_crn_url, "Allocation should include a CRN URL" + + initial_crn_base, initial_crn_host = _crn_base_url(initial_crn_url) + + 
# Wait for VM to boot on initial CRN + ssh_port = _wait_for_vm_ssh_port(initial_crn_base, instance_hash, timeout=120) + + # SSH baseline check + output = _wait_for_ssh(private_key_path, initial_crn_host, ssh_port, timeout=60) + assert "hello" in output + + # --- Phase 2: Unlink the initial CRN and verify migration --- + + # Find the CRN's node hash + crn_hash = _find_crn_hash(crn_nodes, initial_crn_url) + + # Unlink the CRN from the CCN + aleph_cli("node", "unlink", "--crn", crn_hash) + + # Poll scheduler-api until the allocation moves to a different CRN + def fetch_new_allocation(): + data = _http_get_json( + f"{scheduler_api_url}/api/v0/allocation/{instance_hash}" + ) + if not data or not data.get("node", {}).get("url"): + return None + new_url = data["node"]["url"] + # Must be a *different* CRN than the initial one + new_host = urlparse(new_url).hostname + if new_host != initial_crn_host: + return data + return None + + new_allocation = _poll( + "Scheduler reallocation to new CRN", fetch_new_allocation, timeout=300 + ) + new_crn_url = new_allocation["node"]["url"] + new_crn_base, new_crn_host = _crn_base_url(new_crn_url) + + assert new_crn_host != initial_crn_host, ( + f"Scheduler should migrate to a different CRN, " + f"but got the same host: {new_crn_host}" + ) + + # Wait for VM to boot on the new CRN + new_ssh_port = _wait_for_vm_ssh_port(new_crn_base, instance_hash, timeout=180) + + # SSH into the migrated VM + output = _wait_for_ssh(private_key_path, new_crn_host, new_ssh_port, timeout=60) + assert "hello" in output +``` + +- [ ] **Step 2: Verify syntax** + +Run: `cd /home/olivier/git/aleph/aleph-testnets && python -c "import ast; ast.parse(open('tests/test_migration.py').read()); print('OK')"` + +Expected: `OK` + +- [ ] **Step 3: Verify pytest collects the test** + +Run: `cd /home/olivier/git/aleph/aleph-testnets && python -m pytest tests/test_migration.py --collect-only 2>&1 | head -20` + +Expected: output shows `` collected (it may show skip markers for 
missing env vars, that's fine). + +- [ ] **Step 4: Commit** + +```bash +git add tests/test_migration.py +git commit -m "feat: add VM migration integration test + +Tests the full migration lifecycle: create instance, verify SSH on +initial CRN, unlink that CRN, wait for scheduler to reallocate to a +different CRN, verify SSH on the new CRN. + +The test is written ahead of the scheduler-rs migration implementation +to define the expected contract." +``` + +--- + +### Task 3: Verify everything together + +- [ ] **Step 1: Run full test collection to check for import/fixture errors** + +Run: `cd /home/olivier/git/aleph/aleph-testnets && python -m pytest tests/ --collect-only 2>&1 | tail -20` + +Expected: all tests collected without import errors. `test_instance_migration` should appear in the list. + +- [ ] **Step 2: Run just the migration test in dry-run mode to verify fixture wiring** + +Run: `cd /home/olivier/git/aleph/aleph-testnets && python -m pytest tests/test_migration.py -v --collect-only 2>&1` + +Expected: shows `test_instance_migration` with its fixture dependencies resolved (or skipped due to missing env vars — either is acceptable). diff --git a/docs/superpowers/specs/2026-04-14-vm-migration-test-design.md b/docs/superpowers/specs/2026-04-14-vm-migration-test-design.md new file mode 100644 index 0000000..1a0daff --- /dev/null +++ b/docs/superpowers/specs/2026-04-14-vm-migration-test-design.md @@ -0,0 +1,141 @@ +# VM Migration Integration Test — Design Spec + +## Goal + +Add an integration test that validates VM migration between CRNs. The test creates a VM, observes its initial allocation, unlinks the hosting CRN, and asserts that the scheduler migrates the instance to a different CRN where it becomes reachable via SSH again. + +This test is being written ahead of the scheduler-rs migration implementation. It defines the contract that the scheduler and aleph-vm must fulfill. 
+
+## Prerequisites
+
+- Two CRNs provisioned and registered (`CRN_COUNT=2`)
+- Both CRNs linked to a CCN in the corechannel aggregate
+- Scheduler-rs running with dispatch enabled
+- Ubuntu rootfs image available
+
+## Test Flow
+
+### `test_instance_migration` (file: `tests/test_migration.py`)
+
+```
+ 1. Upload rootfs to CCN (storage engine)
+ 2. Create instance (1 vCPU, 2 GiB RAM, 4 GiB rootfs, SSH key)
+ 3. Poll scheduler-api /api/v0/allocation/{hash} for initial allocation
+    → Capture: initial CRN URL
+ 4. Poll initial CRN /v2/about/executions/list for running VM + SSH port
+ 5. SSH into VM on initial CRN, verify connectivity (echo hello)
+ 6. Identify the initial CRN's node hash from the corechannel aggregate
+ 7. Run: aleph node unlink --crn <node-hash>
+ 8. Poll scheduler-api /api/v0/allocation/{hash} until CRN URL differs
+    from the initial one
+    → Capture: new CRN URL
+ 9. Poll new CRN /v2/about/executions/list for running VM + SSH port
+10. SSH into VM on new CRN, verify connectivity (echo hello)
+```
+
+### Timeout Budget
+
+Total test timeout: **900 seconds** (15 minutes).
+
+Breakdown:
+- Steps 1-2 (upload + create): ~30s
+- Step 3 (initial allocation): up to 180s
+- Steps 4-5 (VM boot + SSH on first CRN): up to 180s
+- Steps 6-7 (identify + unlink): ~10s
+- Step 8 (reallocation): up to 300s — generous because the scheduler
+  migration logic is in development
+- Steps 9-10 (VM boot + SSH on new CRN): up to 180s
+
+Individual poll timeouts are set within these bounds. The 900s outer
+timeout is a hard ceiling.
+
+## New Fixtures
+
+### `crn_nodes` (session-scoped, in `conftest.py`)
+
+Reads the corechannel aggregate from the nodestatus address and returns
+the list of registered CRN entries. Each entry contains at minimum:
+
+- `hash` — the CRN's node hash (used for unlink)
+- `address` — the CRN's HTTP address (e.g.
`http://1.2.3.4:4020`)
+
+```python
+@pytest.fixture(scope="session")
+def crn_nodes(ccn_aggregates):
+    """Registered CRN entries from the corechannel aggregate."""
+    NODESTATUS_ADDR = "0x70997970C51812dc3A010C7d01b50e0d17dc79C8"
+    agg = ccn_aggregates(NODESTATUS_ADDR, "corechannel")
+    if agg is None:
+        pytest.skip("No corechannel aggregate — migration tests require registered CRNs")
+    resource_nodes = agg.get("resource_nodes", [])
+    if len(resource_nodes) < 2:
+        pytest.skip(
+            f"Need at least 2 CRNs for migration tests, found {len(resource_nodes)}"
+        )
+    return resource_nodes
+```
+
+The fixture skips (rather than asserting) so the migration test degrades
+gracefully against a single-CRN environment, matching the plan and the
+conftest implementation.
+
+The test matches the scheduler-api's allocation URL against
+`crn_nodes[*].address` to identify which CRN was picked and derive
+its hash for the unlink command.
+
+## Test Code Structure
+
+### `tests/test_migration.py`
+
+Self-contained test file with:
+
+- `_poll(description, fetch, timeout, interval)` — generic polling
+  helper (same pattern as `test_instances.py`)
+- `_http_get_json(url)` — HTTP GET + JSON parse helper
+- `test_instance_migration(...)` — the test function
+
+Fixtures used: `aleph_cli`, `rootfs_image`, `ssh_key_pair`,
+`scheduler_api_url`, `crn_nodes`.
+
+### Matching allocation to CRN hash
+
+The scheduler-api returns `node.url` (e.g. `http://1.2.3.4:4020`).
+The corechannel aggregate stores `address` for each CRN in the same
+format. The test matches on hostname to find the CRN hash:
+
+```python
+from urllib.parse import urlparse
+
+def find_crn_hash(crn_nodes, allocation_url):
+    alloc_host = urlparse(allocation_url).hostname
+    for node in crn_nodes:
+        if urlparse(node["address"]).hostname == alloc_host:
+            return node["hash"]
+    pytest.fail(f"No CRN in aggregate matches allocation URL {allocation_url}")
+```
+
+### SSH verification
+
+Same approach as `test_instances.py`: SSH to `root@<host>` on the
+mapped port with the ephemeral key, run `echo hello`, check output.
+ +## Files Changed + +| File | Change | +|------|--------| +| `tests/test_migration.py` | New — migration integration test | +| `tests/conftest.py` | Add `crn_nodes` fixture | + +## Files NOT Changed + +| File | Reason | +|------|--------| +| `scripts/crn-up.sh` | Already supports `CRN_COUNT` env var | +| `scripts/local-up.sh` | `CRN_COUNT` is already env-driven | +| `deploy/docker-compose.yml` | No scheduler config changes needed | + +## Out of Scope + +- **Volume data integrity**: writing a file before migration and reading + it after. Valuable follow-up test, but not in this iteration. +- **Ungraceful migration**: node crash / downtime detection. The unlink + path is the only clean migration trigger. +- **Sequential migrations**: migrating the same instance multiple times. +- **IPv6 verification**: the spec says "new IPv6 address" but the test + verifies reachability via SSH on the mapped port (IPv4 to CRN, which + forwards to the VM). Direct IPv6 connectivity testing can be added + later. diff --git a/tests/conftest.py b/tests/conftest.py index 550d3ac..af6c274 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -259,3 +259,22 @@ def call(to: str, sig: str, *args: str) -> str: ) return result.stdout.strip() return call + + +@pytest.fixture(scope="session") +def crn_nodes(ccn_aggregates): + """Registered CRN entries from the corechannel aggregate. + + Returns a list of dicts, each with at least 'hash' and 'address' keys. + Requires CRN_COUNT=2 (or more) during provisioning. 
+ """ + NODESTATUS_ADDR = "0x70997970C51812dc3A010C7d01b50e0d17dc79C8" + agg = ccn_aggregates(NODESTATUS_ADDR, "corechannel") + if agg is None: + pytest.skip("No corechannel aggregate — migration tests require registered CRNs") + resource_nodes = agg.get("resource_nodes", []) + if len(resource_nodes) < 2: + pytest.skip( + f"Need at least 2 CRNs for migration tests, found {len(resource_nodes)}" + ) + return resource_nodes diff --git a/tests/test_migration.py b/tests/test_migration.py new file mode 100644 index 0000000..5313a18 --- /dev/null +++ b/tests/test_migration.py @@ -0,0 +1,204 @@ +"""Integration test for VM migration between CRNs. + +Flow: create instance → verify SSH on initial CRN → unlink that CRN → +verify scheduler reallocates to a different CRN → verify SSH on new CRN. + +Requires: +- Two CRNs provisioned and linked (CRN_COUNT=2) +- Scheduler-rs with dispatch enabled +- Ubuntu rootfs image (ALEPH_TESTNET_ROOTFS) +""" +import json +import subprocess +import time +import urllib.request +import urllib.error +from urllib.parse import urlparse + +import pytest + + +def _poll(description, fetch, timeout, interval=5): + """Poll fetch() until it returns a truthy value or timeout is reached. + + fetch() should return the result on success or None to keep polling. + It may raise to keep polling (exceptions are swallowed until timeout). 
+ """ + deadline = time.time() + timeout + last_err = None + while time.time() < deadline: + try: + result = fetch() + if result is not None: + return result + except Exception as e: + last_err = e + time.sleep(interval) + pytest.fail(f"{description} did not succeed within {timeout}s (last error: {last_err})") + + +def _http_get_json(url): + """GET a URL and return parsed JSON, or None on HTTP error.""" + req = urllib.request.Request(url, headers={"Accept": "application/json"}) + try: + resp = urllib.request.urlopen(req, timeout=10) + return json.loads(resp.read()) + except (urllib.error.HTTPError, urllib.error.URLError, OSError): + return None + + +def _crn_base_url(raw_url) -> tuple[str, str]: + """Normalize a CRN URL to (base_url, hostname).""" + parsed = urlparse(raw_url) + host = parsed.hostname + port = parsed.port or 4020 + return f"http://{host}:{port}", host + + +def _find_crn_hash(crn_nodes, allocation_url): + """Match a scheduler allocation URL to a CRN node hash.""" + alloc_host = urlparse(allocation_url).hostname + for node in crn_nodes: + if urlparse(node["address"]).hostname == alloc_host: + return node["hash"] + pytest.fail( + f"No CRN in corechannel aggregate matches allocation URL {allocation_url}" + ) + + +def _wait_for_ssh(private_key_path, host, port, timeout=60): + """Poll SSH until 'echo hello' succeeds on the given host:port.""" + def try_ssh(): + result = subprocess.run( + [ + "ssh", + "-i", private_key_path, + "-p", str(port), + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-o", "ConnectTimeout=5", + f"root@{host}", + "echo hello", + ], + capture_output=True, + text=True, + timeout=15, + ) + if result.returncode == 0 and "hello" in result.stdout: + return result.stdout.strip() + return None + + return _poll(f"SSH into {host}:{port}", try_ssh, timeout=timeout) + + +def _wait_for_vm_ssh_port(crn_base, instance_hash, timeout=120): + """Poll a CRN's execution list until the VM is running with a mapped SSH 
port.""" + def fetch_ssh_port(): + data = _http_get_json(f"{crn_base}/v2/about/executions/list") + if not data: + return None + execution = data.get(instance_hash) + if not execution: + return None + if not execution.get("running"): + return None + networking = execution.get("networking", {}) + mapped_ports = networking.get("mapped_ports", {}) + port_22 = mapped_ports.get("22") or mapped_ports.get(22) + if port_22 and port_22.get("host"): + return int(port_22["host"]) + return None + + return _poll(f"VM boot + SSH port on {crn_base}", fetch_ssh_port, timeout=timeout) + + +@pytest.mark.timeout(900) +def test_instance_migration( + aleph_cli, rootfs_image, ssh_key_pair, scheduler_api_url, crn_nodes +): + """End-to-end: create instance → SSH → unlink CRN → scheduler migrates → SSH on new CRN.""" + private_key_path, public_key_path = ssh_key_pair + + # --- Phase 1: Create instance and verify on initial CRN --- + + # Upload rootfs + upload_result = aleph_cli( + "file", "upload", rootfs_image, "--storage-engine", "storage", parse_json=True + ) + rootfs_hash = upload_result["item_hash"] + assert rootfs_hash, "Upload should return an item_hash" + + # Create instance + instance_result = aleph_cli( + "instance", "create", + "--rootfs", rootfs_hash, + "--rootfs-size", "4GiB", + "--ssh-pubkey-file", public_key_path, + "--vcpus", "1", + "--memory", "2GiB", + parse_json=True, + ) + instance_hash = instance_result["item_hash"] + assert instance_hash, "Instance create should return an item_hash" + + # Poll scheduler-api for initial allocation + def fetch_allocation(): + data = _http_get_json( + f"{scheduler_api_url}/api/v0/allocation/{instance_hash}" + ) + if data and data.get("node", {}).get("url"): + return data + return None + + allocation = _poll("Scheduler allocation", fetch_allocation, timeout=180) + initial_crn_url = allocation["node"]["url"] + assert initial_crn_url, "Allocation should include a CRN URL" + + initial_crn_base, initial_crn_host = 
_crn_base_url(initial_crn_url) + + # Wait for VM to boot on initial CRN + ssh_port = _wait_for_vm_ssh_port(initial_crn_base, instance_hash, timeout=120) + + # SSH baseline check + output = _wait_for_ssh(private_key_path, initial_crn_host, ssh_port, timeout=60) + assert "hello" in output + + # --- Phase 2: Unlink the initial CRN and verify migration --- + + # Find the CRN's node hash + crn_hash = _find_crn_hash(crn_nodes, initial_crn_url) + + # Unlink the CRN from the CCN + aleph_cli("node", "unlink", "--crn", crn_hash) + + # Poll scheduler-api until the allocation moves to a different CRN + def fetch_new_allocation(): + data = _http_get_json( + f"{scheduler_api_url}/api/v0/allocation/{instance_hash}" + ) + if not data or not data.get("node", {}).get("url"): + return None + new_url = data["node"]["url"] + # Must be a *different* CRN than the initial one + new_host = urlparse(new_url).hostname + if new_host != initial_crn_host: + return data + return None + + new_allocation = _poll( + "Scheduler reallocation to new CRN", fetch_new_allocation, timeout=300 + ) + new_crn_url = new_allocation["node"]["url"] + new_crn_base, new_crn_host = _crn_base_url(new_crn_url) + + assert new_crn_host != initial_crn_host, ( + f"Scheduler should migrate to a different CRN, " + f"but got the same host: {new_crn_host}" + ) + + # Wait for VM to boot on the new CRN + new_ssh_port = _wait_for_vm_ssh_port(new_crn_base, instance_hash, timeout=180) + + # SSH into the migrated VM + output = _wait_for_ssh(private_key_path, new_crn_host, new_ssh_port, timeout=60) + assert "hello" in output