exo-explore · adurham · Apr 2, 2026
diff --git a/src/exo/worker/engines/mlx/cache.py b/src/exo/worker/engines/mlx/cache.py
@@ -1,3 +1,4 @@
+import gc
 import os
 from copy import deepcopy
 from typing import TYPE_CHECKING
@@ -266,6 +267,7 @@ def _evict_if_needed(self):
         if len(self.caches) == 0:
             return
 
+        evicted_any = False
         # Evict LRU entries until below threshold
         while (
             len(self.caches) > 0
@@ -278,10 +280,18 @@ def _evict_if_needed(self):
             self._snapshots.pop(lru_index)
             self._media_regions.pop(lru_index)
             self._last_used.pop(lru_index)
+            evicted_any = True
             logger.info(
                 f"KV cache evicted LRU entry ({evicted_tokens} tokens) due to memory usage"
             )
 
+        if evicted_any:
+            # Force Python GC to release array references, then clear Metal buffer cache.
+            # Without this, evicted MLX arrays stay allocated until the next GC cycle,
+            # leaking ~3-4 GB between long-context requests.
+            gc.collect()
+            mx.clear_cache()
+
     def get_memory_used_percentage(self) -> float:
         local_pressure: float = get_memory_used_percentage()