diff --git a/scopes/toolbox/fs/last-modified/last-modified.ts b/scopes/toolbox/fs/last-modified/last-modified.ts index 13a72d81fbea..f88ea7afba67 100644 --- a/scopes/toolbox/fs/last-modified/last-modified.ts +++ b/scopes/toolbox/fs/last-modified/last-modified.ts @@ -4,16 +4,31 @@ import fs from 'fs-extra'; import { compact } from 'lodash'; /** - * check recursively all the sub-directories as well + * last-modified mtime of a component's directory structure (recursively), used as the dependency + * fs-cache freshness signal. + * + * the source tree is scanned recursively, but the scan stops at the `node_modules` boundary: the + * dependency auto-detect (whose cached result this guards) never traverses *into* a package, so the + * package internals and the transitive store are irrelevant — recursing `node_modules` followed the + * symlinked store and made this scan ~60x larger. Instead only the `node_modules` dir and its + * `@scope` dirs' mtimes are taken: a direct dep added / removed / version-relinked / componentId- + * changed all go through a relink that rewrites the symlink entry, bumping the containing dir. */ async function getLastModifiedDirTimestampMs(rootDir: string): Promise { - const allDirs = await globby(rootDir, { + // source subdirectories, excluding the deep node_modules subtree. + const sourceDirs = await globby(rootDir, { onlyDirectories: true, - // ignore: ['**/node_modules/**'], // need to think about it more. sometimes we do want to invalidate cache upon node_modules changes inside component dir - // stats: true // todo: consider retrieving the stats from here. + ignore: ['**/node_modules/**'], }); - allDirs.push(rootDir); - return getLastModifiedPathsTimestampMs(allDirs); + sourceDirs.push(rootDir); + // node_modules/@scope dirs (catch a scoped dep changing within an already-existing scope). a *bare* + // `node_modules` glob would recurse the whole symlinked tree, so its own dir mtime is taken via the + // direct stat in getLastModifiedPathsTimestampMs below, not globbed. + const scopeDirs = await globby(`${rootDir}/node_modules/@*`, { + onlyDirectories: true, + followSymbolicLinks: false, + }); + return getLastModifiedPathsTimestampMs([...sourceDirs, ...scopeDirs, `${rootDir}/node_modules`]); } export async function getLastModifiedPathsTimestampMs(paths: string[]): Promise { diff --git a/scopes/workspace/workspace/component-loading-redesign.md b/scopes/workspace/workspace/component-loading-redesign.md index 4391f3347118..08e032fff8fb 100644 --- a/scopes/workspace/workspace/component-loading-redesign.md +++ b/scopes/workspace/workspace/component-loading-redesign.md @@ -1,7 +1,7 @@ # Component Loading Redesign **Status:** Phase 1 shipped; Phase 2 in progress -**Last updated:** 2026-06-15 (code references are against `master` @ `59855b104`; line numbers will drift) +**Last updated:** 2026-06-23 (code references are against `master`; line numbers will drift) This document is the source of truth for a multi-phase effort to simplify Bit's component-loading mechanism: fewer caches, a staged (lazy) loading pipeline, a single env/aspect load planner, and a @@ -204,11 +204,18 @@ earlier ones teach us). ### Phase 2 — Quick perf wins on existing seams - [x] Benchmark harness committed + baseline recorded (see §4) — **gate for the rest of the phase** -- [ ] Lazy file contents in `ModelComponent.toConsumerComponent` +- [x] Restrict the deps-cache invalidation scan to the node*modules footprint the cache depends on: + the per-component freshness scan stops at the `node_modules` boundary (auto-detect never traverses + \_into* a package) and only takes each component's `node_modules`/`@scope` dir mtimes — a direct + dep added/removed/relinked bumps them. Cuts warm `bit status` fs syscalls ~37% (74.3k→46.4k); + warm-wall-neutral (I/O-wait), helps cold/CI (see §4.1). Kept per-component (no shared index — it + added complexity for zero syscall benefit and over-scanned single-component commands). +- [ ] `on-load` slot-laziness (`loadDocs: false, loadCompositions: false` for non-UI flows) — the + largest CPU-bound stage (9.2s) and the next **warm-wall** target +- [ ] Lazy file contents in `ModelComponent.toConsumerComponent` (helps `graph`, not `status`) - [ ] `bit deps usage`: ids + stored deps instead of full load - [ ] IDE metadata endpoint (`api-for-ide.ts`): S0-S2-level data only - [ ] `bit remove` / forking: drop full-component loads where only ids/paths are used -- [ ] Default `loadDocs: false, loadCompositions: false` for non-UI flows ### Phase 3 — Cache consolidation @@ -270,13 +277,13 @@ Numbers below are aggregate self-time across ~313 components at ~6× concurrency **`bit status` (warm, ~13s wall):** -| stage | aggregate self-time | ~wall | note | -| ----------------------------------------------------------- | ------------------- | ----- | --------------------------------------------- | -| `legacy-load-deps` | 43s | ~7s | dependency-object materialization (see below) | -| `on-load` (slot handlers: docs, compositions, schema, pkg…) | 10s | ~1.7s | trimmable for non-UI flows | -| `dependency-resolution` (Harmony resolver) | 7.7s | ~1.3s | | -| `execute-load-slot` (own) | 5.8s | ~1s | | -| `consumer-fs-load` (file content reads) | negligible | — | not a `status` cost | +| stage | aggregate self-time | ~wall | note | +| ----------------------------------------------------------- | ------------------- | ----- | ----------------------------------------------------------- | +| `legacy-load-deps` | 43s | ~7s | filesystem traversal in deps-cache invalidation (see below) | +| `on-load` (slot handlers: docs, compositions, schema, pkg…) | 10s | ~1.7s | trimmable for non-UI flows | +| `dependency-resolution` (Harmony resolver) | 7.7s | ~1.3s | | +| `execute-load-slot` (own) | 5.8s | ~1s | | +| `consumer-fs-load` (file content reads) | negligible | — | not a `status` cost | **`bit graph --json` (warm, ~20s wall; ~7.5s of it is loading):** @@ -285,23 +292,38 @@ Numbers below are aggregate self-time across ~313 components at ~6× concurrency | `consumer-fs-load` (file content reads) | 5.8s | dominant load cost for graph | | `legacy-load-deps` | 1.2s | graph uses a lighter load path | -**Key conclusions (validated, not hypothesized):** +**Key conclusions (validated by measurement; these _supersede_ the earlier "object materialization" +hypothesis, which deeper sub-step instrumentation disproved):** - **The dependency FS cache works.** On a warm `bit status`, dep loading is **635 cache hits, 0 misses/recomputes**. `legacy-load-deps` is _not_ re-resolving dependencies. -- **`status`'s dominant cost (~7s wall) is dependency-object _materialization on cache hit_** — - `DependenciesData.deserialize` + reconstructing full `Dependency`/`DependencyList` objects + - `applyOverrides`, for every component, even though `status` reads little of it. This is the - "all-or-nothing, always fully materialize" problem of §1.1 — **structural, not a cache bug.** - Reducing it needs the staged/lazy-loading work (defer dependency-object construction), **not a - Phase-2 quick fix.** Earlier framing of this as "39s" was aggregate-concurrent self-time, not - wall; wall is ~13s. -- **`graph`'s dominant load cost is file-content reads (`consumer-fs-load`, 5.8s)** → this is what - **lazy file contents** (§2.1) targets; validated as a real win for `graph`/scope-side loads, but - it does **not** help `status` (whose file reads are negligible). -- Implication for Phase-2 ordering: lazy file contents helps `graph`; per-command partial loads help - `deps usage`/IDE/`remove`/forking; `loadDocs/loadCompositions: false` trims `status`'s slot work - (~1.7s). The big `status` number is deferred to the staged-loading phase. +- **`legacy-load-deps` is filesystem I/O, not object materialization.** Sub-step timing of the warm + cache-hit path (313 components, aggregate self-time): `statFiles` **22.3s** + `cacheRead` **10.9s**, + versus `deserialize` **9ms**, `applyOverrides` 0.4s, `updateVersions` 0.1s. The cost is the + deps-cache _read + invalidation_ layer paid per component — **not** `DependenciesData.deserialize` / + `Dependency` reconstruction (negligible). `statFiles` was a recursive `globby` per component that + **followed the component's `node_modules` symlink into the shared workspace `node_modules`** (226k + of 230k scanned entries), run 313× per command. +- **Aggregate self-time ≠ wall — sharply.** Restricting the node*modules traversal to the footprint + the cache depends on (the dependency auto-detect never traverses \_into* a package, so only the + component's `node_modules`/`@scope` dir mtimes matter — a dep add/remove/relink bumps them; the deep + tree and transitive store are irrelevant) cut **fs syscalls 74.3k→46.4k per warm `bit status` + (~37%)** — yet a same-state wall A/B moved warm wall by **~0.3s**. The removed work is I/O-\_wait\* that overlaps with CPU on the single JS thread; on + a warm SSD it was never on the critical path (real win on cold/CI/networked filesystems, where it + is). `cacheRead` (10.9s) is likewise I/O-wait, so consolidating it would also be warm-wall-neutral. + (An earlier iteration _ignored_ node_modules entirely — faster still, but a correctness regression: + the cache resolves imports through node_modules, so its structure must invalidate the cache.) +- **The warm-wall bottleneck is CPU-bound, single-threaded JS** — the stages whose self-time ≈ their + wall contribution: **`on-load` slot handlers (9.2s)**, **`dependency-resolution` (7.2s)**, + **`workspace.get` (5s)**. These — not the deps-cache I/O — are what move warm `status` wall. +- **`graph`'s dominant load cost is file-content reads (`consumer-fs-load`, 5.8s)** → the target for + **lazy file contents** (§2.1); a real win for `graph`/scope-side loads, but it does **not** help + `status` (whose file reads are negligible). +- Implication for Phase-2 ordering: the deps-cache invalidation batch ships as a standalone fs/CPU + efficiency win (helps cold/CI, warm-wall-neutral). The next **warm-wall** target is `on-load` + slot-laziness (`loadDocs`/`loadCompositions: false` for non-UI flows) — the largest CPU-bound stage. + Earlier framing of the deps cost as "39s"/"materialization" was both an aggregate-vs-wall and a + cause misread; corrected here. --- @@ -310,7 +332,7 @@ Numbers below are aggregate self-time across ~313 components at ~6× concurrency | Phase | State | OpenSpec change | PRs | | ----------------------- | ----------- | ------------------------------ | --------------------------------------------------- | | 1 — Observability | done | `component-load-observability` | [#10418](https://github.com/teambit/bit/pull/10418) | -| 2 — Quick perf wins | in progress | — | — | +| 2 — Quick perf wins | in progress | — | [#10445](https://github.com/teambit/bit/pull/10445) | | 3 — Cache consolidation | not started | — | — | | 4 — Staged pipeline | not started | — | — | | 5 — Env planner | not started | — | — | @@ -340,3 +362,24 @@ Numbers below are aggregate self-time across ~313 components at ~6× concurrency file-content reads (`consumer-fs-load`, 5.8s) → the target for lazy file contents. (Correction: an earlier "39s" figure was aggregate-concurrent self-time, not wall; warm wall is ~13s.) Direction for the next Phase-2 PR intentionally left open. +- 2026-06-23 — Deeper sub-step instrumentation **disproved the "object materialization" conclusion** + (see §4.1). The warm `legacy-load-deps` cost is filesystem I/O in the deps-cache invalidation: + `statFiles` 22.3s + `cacheRead` 10.9s aggregate, while `deserialize` is 9ms. `statFiles` was a + per-component recursive `globby` that followed each component's `node_modules` symlink into the + shared `node_modules` (226k/230k scanned entries), run 313× per command. Shipped the first Phase-2 + perf change: restrict that per-component scan to the node*modules footprint the cache depends on — + it stops at the `node_modules` boundary and only takes the `node_modules`/`@scope` dir mtimes + (`getLastModifiedDirTimestampMs` in `@teambit/toolbox.fs.last-modified`). Result: **warm `bit status` + fs syscalls 74.3k→46.4k (~37%)**, `readFile` traffic unchanged (no regression, checked against the + bootstrap fs-read e2e metric). **Important correctness lesson:** a first cut simply \_ignored* + node*modules (74.3k→44.8k) — but the deps auto-detect resolves imports \_through* node_modules and + reads each direct dep's `package.json` (`name`/`componentId`), so the cache depends on node_modules + content; ignoring it is a regression. The tree builder stops at the package boundary + (`generate-tree-madge.ts` filter), so only the component's `node_modules`/`@scope` dir mtimes are + needed (a dep add/remove/relink bumps them) — not the deep tree. See [[deps-cache-node-modules-invalidation]]. + (A shared command-scoped index was tried and dropped: it added memoization/invalidation complexity + for zero syscall benefit — batching saves no syscalls here — and over-scanned single-component commands.) + Key perf lesson reaffirmed: aggregate self-time ≠ wall — a same-state A/B moved warm wall only ~0.3s + because the cut work was I/O-wait overlapping CPU on the single JS thread (real win on cold/CI/ + networked FS). The warm-wall bottleneck is CPU-bound: `on-load` (9.2s), `dependency-resolution` + (7.2s), `workspace.get` (5s) — next warm-wall target is `on-load` slot-laziness.