diff --git a/scopes/toolbox/fs/last-modified/last-modified.ts b/scopes/toolbox/fs/last-modified/last-modified.ts
index 13a72d81fbea..f88ea7afba67 100644
--- a/scopes/toolbox/fs/last-modified/last-modified.ts
+++ b/scopes/toolbox/fs/last-modified/last-modified.ts
@@ -4,16 +4,31 @@ import fs from 'fs-extra';
 import { compact } from 'lodash';
 
 /**
- * check recursively all the sub-directories as well
+ * last-modified mtime of a component's directory structure (recursively), used as the dependency
+ * fs-cache freshness signal.
+ *
+ * the source tree is scanned recursively, but the scan stops at the `node_modules` boundary: the
+ * dependency auto-detect (whose cached result this guards) never traverses *into* a package, so the
+ * package internals and the transitive store are irrelevant — recursing `node_modules` followed the
+ * symlinked store and made this scan ~60x larger. Instead only the `node_modules` dir and its
+ * `@scope` dirs' mtimes are taken: a direct dep added / removed / version-relinked / componentId-
+ * changed all go through a relink that rewrites the symlink entry, bumping the containing dir.
  */
 async function getLastModifiedDirTimestampMs(rootDir: string): Promise<number> {
-  const allDirs = await globby(rootDir, {
+  // source subdirectories, excluding the deep node_modules subtree.
+  const sourceDirs = await globby(rootDir, {
     onlyDirectories: true,
-    // ignore: ['**/node_modules/**'], // need to think about it more. sometimes we do want to invalidate cache upon node_modules changes inside component dir
-    // stats: true // todo: consider retrieving the stats from here.
+    ignore: ['**/node_modules/**'],
   });
-  allDirs.push(rootDir);
-  return getLastModifiedPathsTimestampMs(allDirs);
+  sourceDirs.push(rootDir);
+  // node_modules/@scope dirs (catch a scoped dep changing within an already-existing scope). a *bare*
+  // `node_modules` glob would recurse the whole symlinked tree, so its own dir mtime is taken via the
+  // direct stat in getLastModifiedPathsTimestampMs below, not globbed.
+  const scopeDirs = await globby(`${rootDir}/node_modules/@*`, {
+    onlyDirectories: true,
+    followSymbolicLinks: false,
+  });
+  return getLastModifiedPathsTimestampMs([...sourceDirs, ...scopeDirs, `${rootDir}/node_modules`]);
 }
 
 export async function getLastModifiedPathsTimestampMs(paths: string[]): Promise<number> {
diff --git a/scopes/workspace/workspace/component-loading-redesign.md b/scopes/workspace/workspace/component-loading-redesign.md
index 4391f3347118..08e032fff8fb 100644
--- a/scopes/workspace/workspace/component-loading-redesign.md
+++ b/scopes/workspace/workspace/component-loading-redesign.md
@@ -1,7 +1,7 @@
 # Component Loading Redesign
 
 **Status:** Phase 1 shipped; Phase 2 in progress
-**Last updated:** 2026-06-15 (code references are against `master` @ `59855b104`; line numbers will drift)
+**Last updated:** 2026-06-23 (code references are against `master`; line numbers will drift)
 
 This document is the source of truth for a multi-phase effort to simplify Bit's component-loading
 mechanism: fewer caches, a staged (lazy) loading pipeline, a single env/aspect load planner, and a
@@ -204,11 +204,18 @@ earlier ones teach us).
 ### Phase 2 — Quick perf wins on existing seams
 
 - [x] Benchmark harness committed + baseline recorded (see §4) — **gate for the rest of the phase**
-- [ ] Lazy file contents in `ModelComponent.toConsumerComponent`
+- [x] Restrict the deps-cache invalidation scan to the `node_modules` footprint the cache depends on:
+      the per-component freshness scan stops at the `node_modules` boundary (auto-detect never traverses
+      _into_ a package) and only takes each component's `node_modules`/`@scope` dir mtimes — a direct
+      dep added/removed/relinked bumps them. Cuts warm `bit status` fs syscalls ~37% (74.3k→46.4k);
+      warm-wall-neutral (I/O-wait), helps cold/CI (see §4.1). Kept per-component (no shared index — it
+      added complexity for zero syscall benefit and over-scanned single-component commands).
+- [ ] `on-load` slot-laziness (`loadDocs: false, loadCompositions: false` for non-UI flows) — the
+      largest CPU-bound stage (9.2s) and the next **warm-wall** target
+- [ ] Lazy file contents in `ModelComponent.toConsumerComponent` (helps `graph`, not `status`)
 - [ ] `bit deps usage`: ids + stored deps instead of full load
 - [ ] IDE metadata endpoint (`api-for-ide.ts`): S0-S2-level data only
 - [ ] `bit remove` / forking: drop full-component loads where only ids/paths are used
-- [ ] Default `loadDocs: false, loadCompositions: false` for non-UI flows
 
 ### Phase 3 — Cache consolidation
 
@@ -270,13 +277,13 @@ Numbers below are aggregate self-time across ~313 components at ~6× concurrency
 
 **`bit status` (warm, ~13s wall):**
 
-| stage                                                       | aggregate self-time | ~wall | note                                          |
-| ----------------------------------------------------------- | ------------------- | ----- | --------------------------------------------- |
-| `legacy-load-deps`                                          | 43s                 | ~7s   | dependency-object materialization (see below) |
-| `on-load` (slot handlers: docs, compositions, schema, pkg…) | 10s                 | ~1.7s | trimmable for non-UI flows                    |
-| `dependency-resolution` (Harmony resolver)                  | 7.7s                | ~1.3s |                                               |
-| `execute-load-slot` (own)                                   | 5.8s                | ~1s   |                                               |
-| `consumer-fs-load` (file content reads)                     | negligible          | —     | not a `status` cost                           |
+| stage                                                       | aggregate self-time | ~wall | note                                                        |
+| ----------------------------------------------------------- | ------------------- | ----- | ----------------------------------------------------------- |
+| `legacy-load-deps`                                          | 43s                 | ~7s   | filesystem traversal in deps-cache invalidation (see below) |
+| `on-load` (slot handlers: docs, compositions, schema, pkg…) | 10s                 | ~1.7s | trimmable for non-UI flows                                  |
+| `dependency-resolution` (Harmony resolver)                  | 7.7s                | ~1.3s |                                                             |
+| `execute-load-slot` (own)                                   | 5.8s                | ~1s   |                                                             |
+| `consumer-fs-load` (file content reads)                     | negligible          | —     | not a `status` cost                                         |
 
 **`bit graph --json` (warm, ~20s wall; ~7.5s of it is loading):**
 
@@ -285,23 +292,38 @@ Numbers below are aggregate self-time across ~313 components at ~6× concurrency
 | `consumer-fs-load` (file content reads) | 5.8s                | dominant load cost for graph   |
 | `legacy-load-deps`                      | 1.2s                | graph uses a lighter load path |
 
-**Key conclusions (validated, not hypothesized):**
+**Key conclusions (validated by measurement; these _supersede_ the earlier "object materialization"
+hypothesis, which deeper sub-step instrumentation disproved):**
 
 - **The dependency FS cache works.** On a warm `bit status`, dep loading is **635 cache hits, 0
   misses/recomputes**. `legacy-load-deps` is _not_ re-resolving dependencies.
-- **`status`'s dominant cost (~7s wall) is dependency-object _materialization on cache hit_** —
-  `DependenciesData.deserialize` + reconstructing full `Dependency`/`DependencyList` objects +
-  `applyOverrides`, for every component, even though `status` reads little of it. This is the
-  "all-or-nothing, always fully materialize" problem of §1.1 — **structural, not a cache bug.**
-  Reducing it needs the staged/lazy-loading work (defer dependency-object construction), **not a
-  Phase-2 quick fix.** Earlier framing of this as "39s" was aggregate-concurrent self-time, not
-  wall; wall is ~13s.
-- **`graph`'s dominant load cost is file-content reads (`consumer-fs-load`, 5.8s)** → this is what
-  **lazy file contents** (§2.1) targets; validated as a real win for `graph`/scope-side loads, but
-  it does **not** help `status` (whose file reads are negligible).
-- Implication for Phase-2 ordering: lazy file contents helps `graph`; per-command partial loads help
-  `deps usage`/IDE/`remove`/forking; `loadDocs/loadCompositions: false` trims `status`'s slot work
-  (~1.7s). The big `status` number is deferred to the staged-loading phase.
+- **`legacy-load-deps` is filesystem I/O, not object materialization.** Sub-step timing of the warm
+  cache-hit path (313 components, aggregate self-time): `statFiles` **22.3s** + `cacheRead` **10.9s**,
+  versus `deserialize` **9ms**, `applyOverrides` 0.4s, `updateVersions` 0.1s. The cost is the
+  deps-cache _read + invalidation_ layer paid per component — **not** `DependenciesData.deserialize` /
+  `Dependency` reconstruction (negligible). `statFiles` was a recursive `globby` per component that
+  **followed the component's `node_modules` symlink into the shared workspace `node_modules`** (226k
+  of 230k scanned entries), run 313× per command.
+- **Aggregate self-time ≠ wall — sharply.** Restricting the `node_modules` traversal to the footprint
+  the cache depends on (the dependency auto-detect never traverses _into_ a package, so only the
+  component's `node_modules`/`@scope` dir mtimes matter — a dep add/remove/relink bumps them; the deep
+  tree and transitive store are irrelevant) cut **fs syscalls 74.3k→46.4k per warm `bit status`
+  (~37%)** — yet a same-state wall A/B moved warm wall by **~0.3s**. The removed work is I/O-_wait_ that overlaps with CPU on the single JS thread; on
+  a warm SSD it was never on the critical path (real win on cold/CI/networked filesystems, where it
+  is). `cacheRead` (10.9s) is likewise I/O-wait, so consolidating it would also be warm-wall-neutral.
+  (An earlier iteration _ignored_ node_modules entirely — faster still, but a correctness regression:
+  the cache resolves imports through node_modules, so its structure must invalidate the cache.)
+- **The warm-wall bottleneck is CPU-bound, single-threaded JS** — the stages whose self-time ≈ their
+  wall contribution: **`on-load` slot handlers (9.2s)**, **`dependency-resolution` (7.2s)**,
+  **`workspace.get` (5s)**. These — not the deps-cache I/O — are what move warm `status` wall.
+- **`graph`'s dominant load cost is file-content reads (`consumer-fs-load`, 5.8s)** → the target for
+  **lazy file contents** (§2.1); a real win for `graph`/scope-side loads, but it does **not** help
+  `status` (whose file reads are negligible).
+- Implication for Phase-2 ordering: the deps-cache invalidation batch ships as a standalone fs/CPU
+  efficiency win (helps cold/CI, warm-wall-neutral). The next **warm-wall** target is `on-load`
+  slot-laziness (`loadDocs`/`loadCompositions: false` for non-UI flows) — the largest CPU-bound stage.
+  Earlier framing of the deps cost as "39s"/"materialization" was both an aggregate-vs-wall and a
+  cause misread; corrected here.
 
 ---
 
@@ -310,7 +332,7 @@ Numbers below are aggregate self-time across ~313 components at ~6× concurrency
 | Phase                   | State       | OpenSpec change                | PRs                                                 |
 | ----------------------- | ----------- | ------------------------------ | --------------------------------------------------- |
 | 1 — Observability       | done        | `component-load-observability` | [#10418](https://github.com/teambit/bit/pull/10418) |
-| 2 — Quick perf wins     | in progress | —                              | —                                                   |
+| 2 — Quick perf wins     | in progress | —                              | [#10445](https://github.com/teambit/bit/pull/10445) |
 | 3 — Cache consolidation | not started | —                              | —                                                   |
 | 4 — Staged pipeline     | not started | —                              | —                                                   |
 | 5 — Env planner         | not started | —                              | —                                                   |
@@ -340,3 +362,24 @@ Numbers below are aggregate self-time across ~313 components at ~6× concurrency
   file-content reads (`consumer-fs-load`, 5.8s) → the target for lazy file contents. (Correction: an
   earlier "39s" figure was aggregate-concurrent self-time, not wall; warm wall is ~13s.) Direction
   for the next Phase-2 PR intentionally left open.
+- 2026-06-23 — Deeper sub-step instrumentation **disproved the "object materialization" conclusion**
+  (see §4.1). The warm `legacy-load-deps` cost is filesystem I/O in the deps-cache invalidation:
+  `statFiles` 22.3s + `cacheRead` 10.9s aggregate, while `deserialize` is 9ms. `statFiles` was a
+  per-component recursive `globby` that followed each component's `node_modules` symlink into the
+  shared `node_modules` (226k/230k scanned entries), run 313× per command. Shipped the first Phase-2
+  perf change: restrict that per-component scan to the `node_modules` footprint the cache depends on —
+  it stops at the `node_modules` boundary and only takes the `node_modules`/`@scope` dir mtimes
+  (`getLastModifiedDirTimestampMs` in `@teambit/toolbox.fs.last-modified`). Result: **warm `bit status`
+  fs syscalls 74.3k→46.4k (~37%)**, `readFile` traffic unchanged (no regression, checked against the
+  bootstrap fs-read e2e metric). **Important correctness lesson:** a first cut simply _ignored_
+  `node_modules` (74.3k→44.8k) — but the deps auto-detect resolves imports _through_ `node_modules` and
+  reads each direct dep's `package.json` (`name`/`componentId`), so the cache depends on node_modules
+  content; ignoring it is a regression. The tree builder stops at the package boundary
+  (`generate-tree-madge.ts` filter), so only the component's `node_modules`/`@scope` dir mtimes are
+  needed (a dep add/remove/relink bumps them) — not the deep tree. See [[deps-cache-node-modules-invalidation]].
+  (A shared command-scoped index was tried and dropped: it added memoization/invalidation complexity
+  for zero syscall benefit — batching saves no syscalls here — and over-scanned single-component commands.)
+  Key perf lesson reaffirmed: aggregate self-time ≠ wall — a same-state A/B moved warm wall only ~0.3s
+  because the cut work was I/O-wait overlapping CPU on the single JS thread (real win on cold/CI/
+  networked FS). The warm-wall bottleneck is CPU-bound: `on-load` (9.2s), `dependency-resolution`
+  (7.2s), `workspace.get` (5s) — next warm-wall target is `on-load` slot-laziness.