DataBiosphere · adamnovak · May 7, 2026 · Apr 30, 2026 · May 5, 2026 · May 7, 2026
diff --git a/src/toil/lib/threading.py b/src/toil/lib/threading.py
@@ -141,7 +141,7 @@ def safe_lock(fd: int, block: bool = True, shared: bool = False) -> None:
             if e.errno in (errno.EACCES, errno.EAGAIN):
                 # Nonblocking lock not available.
                 raise
-            elif e.errno == errno.EIO:
+            elif e.errno in (errno.EIO, errno.ENOLCK):
                 # Sometimes Ceph produces IO errors when talking to lock files.
                 # Back off and try again.
                 # TODO: Should we eventually give up if the disk really is
@@ -156,9 +156,17 @@ def safe_lock(fd: int, block: bool = True, shared: bool = False) -> None:
                     error_tries += 1
                     continue
                 else:
-                    logger.critical(
-                        "Too many IO errors talking to lock file. If using Ceph, check for MDS deadlocks. See <https://tracker.ceph.com/issues/62123>."
-                    )
+                    if e.errno == errno.ENOLCK:
+                        logger.critical(
+                            "No locks available after %d retries. The filesystem at the "
+                            "lock path may not support POSIX file locking (common on some "
+                            "NFS setups). Use --coordinationDir to point at local storage.",
+                            MAX_ERROR_TRIES,
+                        )
+                    else:
+                        logger.critical(
+                            "Too many IO errors talking to lock file. If using Ceph, check for MDS deadlocks. See <https://tracker.ceph.com/issues/62123>."
+                        )
                     raise
             else:
                 raise
@@ -171,7 +179,7 @@ def safe_unlock_and_close(fd: int) -> None:
     try:
         fcntl.flock(fd, fcntl.LOCK_UN)
     except OSError as e:
-        if e.errno != errno.EIO:
+        if e.errno not in (errno.EIO, errno.ENOLCK):
             raise
         # Sometimes Ceph produces EIO. We don't need to retry then because
         # we're going to close the FD and after that the file can't remain

diff --git a/src/toil/test/src/threadingTest.py b/src/toil/test/src/threadingTest.py
@@ -6,8 +6,10 @@
 import traceback
 from functools import partial
 from pathlib import Path
+import errno
 
-from toil.lib.threading import LastProcessStandingArena, cpu_count, global_mutex
+from toil.lib.threading import LastProcessStandingArena, cpu_count, global_mutex, safe_lock, safe_unlock_and_close
+from unittest.mock import patch
 
 log = logging.getLogger(__name__)
 
@@ -68,6 +70,59 @@ def testLastProcessStanding(self, tmp_path: Path) -> None:
                 assert not filename.startswith(
                     "precious"
                 ), f"File {filename} still exists"
+
+    # Tests for ENOLCK and EIO handling in safe_lock/safe_unlock_and_close (toil#4846)
+    def testSafeLockRetriesOnENOLCK(self) -> None:
+        enolck = OSError(errno.ENOLCK, "No locks available")
+        # First call raises ENOLCK, second call succeeds
+        with patch("fcntl.flock", side_effect=[enolck, None]) as mock_flock:
+            safe_lock(0)
+            assert mock_flock.call_count == 2
+
+    def testSafeLockFailsAfterMaxRetriesOnENOLCK(self) -> None:
+        enolck = OSError(errno.ENOLCK, "No locks available")
+        # Every call raises ENOLCK, so safe_lock must give up and re-raise it
+        with patch("fcntl.flock", side_effect=enolck):
+            with patch("toil.lib.threading.time.sleep"):  # skip the backoff waits
+                try:
+                    safe_lock(0)
+                    assert False, "Expected OSError to be raised"
+                except OSError as e:
+                    assert e.errno == errno.ENOLCK
+
+    def testSafeLockRetriesOnEIO(self) -> None:
+        eio = OSError(errno.EIO, "Input/Output Error")
+        # First call raises EIO, second call succeeds
+        with patch("fcntl.flock", side_effect=[eio, None]) as mock_flock:
+            safe_lock(0)
+            assert mock_flock.call_count == 2
+
+    def testSafeLockFailsAfterMaxRetriesOnEIO(self) -> None:
+        eio = OSError(errno.EIO, "Input/Output Error")
+        # Every call raises EIO, so safe_lock must give up and re-raise it
+        with patch("fcntl.flock", side_effect=eio):
+            with patch("toil.lib.threading.time.sleep"):  # skip the backoff waits
+                try:
+                    safe_lock(0)
+                    assert False, "Expected OSError to be raised"
+                except OSError as e:
+                    assert e.errno == errno.EIO
+
+    def testSafeUnlockAndCloseSwallowsENOLCK(self) -> None:
+        enolck = OSError(errno.ENOLCK, "No locks available")
+        # Unlocking raises ENOLCK; it must be swallowed and the FD still closed
+        with patch("fcntl.flock", side_effect=enolck):
+            with patch("os.close") as mock_close:
+                safe_unlock_and_close(0)
+                mock_close.assert_called_once_with(0)
+
+    def testSafeUnlockAndCloseSwallowsEIO(self) -> None:
+        # Unlocking raises EIO; it must be swallowed and the FD still closed
+        eio = OSError(errno.EIO, "Input/output error")
+        with patch("fcntl.flock", side_effect=eio):
+            with patch("os.close") as mock_close:
+                safe_unlock_and_close(0)
+                mock_close.assert_called_once_with(0)
 
 
 def _testGlobalMutexOrderingTask(scope: Path, mutex: str, number: int) -> bool: