diff --git a/src/toil/batchSystems/abstractGridEngineBatchSystem.py b/src/toil/batchSystems/abstractGridEngineBatchSystem.py index 2d31cb8e84..be2f3ef92e 100644 --- a/src/toil/batchSystems/abstractGridEngineBatchSystem.py +++ b/src/toil/batchSystems/abstractGridEngineBatchSystem.py @@ -146,14 +146,14 @@ def createJobs(self, newJob: JobTuple) -> bool: self.boss.config.max_jobs ): activity = True - jobID, cpu, memory, command, jobName, environment, gpus = ( + jobID, cpu, memory, walltime, command, jobName, environment, gpus = ( self.waitingJobs.pop(0) ) if self.boss.config.memory_is_product and cpu > 1: memory = memory // cpu # prepare job submission command subLine = self.prepareSubmission( - cpu, memory, jobID, command, jobName, environment, gpus + cpu, memory, walltime, jobID, command, jobName, environment, gpus ) logger.debug("Running %r", subLine) batchJobID = self.boss.with_retries(self.submitJob, subLine) @@ -364,6 +364,7 @@ def prepareSubmission( self, cpu: int, memory: int, + walltime: int, jobID: int, command: str, jobName: str, @@ -500,6 +501,7 @@ def issueBatchJob( job_id, job_desc.cores, job_desc.memory, + job_desc.walltime, command, get_job_kind(job_desc.get_names()), job_environment, diff --git a/src/toil/batchSystems/slurm.py b/src/toil/batchSystems/slurm.py index c86a0b771c..19d17e64de 100644 --- a/src/toil/batchSystems/slurm.py +++ b/src/toil/batchSystems/slurm.py @@ -335,6 +335,7 @@ def prepareSubmission( self, cpu: int, memory: int, + walltime: int, jobID: int, command: str, jobName: str, @@ -344,7 +345,7 @@ def prepareSubmission( # Make sure to use exec so we can get Slurm's signals in the Toil # worker instead of having an intervening Bash return self.prepareSbatch( - cpu, memory, jobID, jobName, job_environment, gpus + cpu, memory, walltime, jobID, jobName, job_environment, gpus ) + [f"--wrap=exec {command}"] def submitJob(self, subLine: list[str]) -> int: @@ -839,6 +840,7 @@ def prepareSbatch( self, cpu: int, mem: int, + walltime: int, jobID: 
int, jobName: str, job_environment: dict[str, str] | None, @@ -889,8 +891,8 @@ def prepareSbatch( # --export=[ALL,] export_all = True - export_list = [] # Some items here may be multiple comma-separated values - time_limit: int | None = self.boss.config.slurm_time # type: ignore[attr-defined] + export_list = [] # Some items here may be multiple comma-separated values + time_limit: int | None = self.boss.config.slurm_time or walltime # type: ignore[attr-defined] partition: str | None = None if nativeConfig is not None: @@ -1041,7 +1043,7 @@ def prepareSbatch( sbatch_line.append(f"--mem={math.ceil(mem / 2 ** 20)}") if cpu is not None: sbatch_line.append(f"--cpus-per-task={math.ceil(cpu)}") - if time_limit is not None: + if time_limit and time_limit > 0: # Put all the seconds in the seconds slot sbatch_line.append(f"--time=0:{time_limit}") @@ -1091,6 +1093,7 @@ def issueBatchJob( job_id, job_desc.cores, memory, + job_desc.walltime, command, get_job_kind(job_desc.get_names()), job_environment, diff --git a/src/toil/common.py b/src/toil/common.py index 25639f448b..da10b1cb16 100644 --- a/src/toil/common.py +++ b/src/toil/common.py @@ -204,6 +204,7 @@ class Config: deadlockCheckInterval: float | int # Resource requirements + defaultWalltime: int defaultMemory: int defaultCores: float | int defaultDisk: int @@ -389,6 +390,7 @@ def set_option(option_name: str, old_names: list[str] | None = None) -> None: set_option("deadlockCheckInterval") set_option("defaultMemory") + set_option("defaultWalltime") set_option("defaultCores") set_option("defaultDisk") set_option("defaultAccelerators") diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index 726dda7098..68263f8a72 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -2193,6 +2193,7 @@ class CWLNamedJob(Job): def __init__( self, cores: float | None = 1, + walltime: int | None = 0, memory: int | str | None = "1GiB", disk: int | str | None = "1MiB", accelerators: list[AcceleratorRequirement] | None 
= None, @@ -2238,6 +2239,7 @@ def __init__( # Set up the job with the right requirements and names. super().__init__( cores=cores, + walltime=walltime, memory=memory, disk=disk, accelerators=accelerators, @@ -2574,6 +2576,17 @@ def __init__( # Note: if the job is using the toil default memory, it won't be increased memory = max(memory, min_ram) + # Check if the tool has set a time limit. If yes, use it. Otherwise, + # use a None requirement to use the Toil default. + tool_max_walltime = tool.get_requirement("ToolTimeLimit")[0] or {} + if ( + "timelimit" in tool_max_walltime + and (limit_val := tool_max_walltime["timelimit"]) is not None + ): + walltime = cast(int, self.builder.do_eval(limit_val)) + else: + walltime = None + accelerators: list[AcceleratorRequirement] | None = None if req.get("cudaDeviceCount", 0) > 0: # There's a CUDARequirement, which cwltool processed for us @@ -2639,6 +2652,7 @@ def __init__( super().__init__( cores=req["cores"], memory=memory, + walltime=walltime, disk=int(total_disk), accelerators=accelerators, preemptible=preemptible, diff --git a/src/toil/job.py b/src/toil/job.py index fa42eb8fdc..0174cf0707 100644 --- a/src/toil/job.py +++ b/src/toil/job.py @@ -68,7 +68,7 @@ from toil.deferred import DeferredFunction from toil.fileStores import FileID from toil.lib.compatibility import deprecated -from toil.lib.conversions import bytes2human, human2bytes +from toil.lib.conversions import bytes2human, human2bytes, seconds_to_dhms from toil.lib.exceptions import UnimplementedURLException from toil.lib.expando import Expando from toil.lib.resources import ResourceMonitor @@ -415,13 +415,14 @@ class RequirementsDict(TypedDict): cores: NotRequired[int | float] memory: NotRequired[int] + walltime: NotRequired[int] disk: NotRequired[int] accelerators: NotRequired[list[AcceleratorRequirement]] preemptible: NotRequired[bool] # These must be all the key names in RequirementsDict -REQUIREMENT_NAMES = ["disk", "memory", "cores", "accelerators", 
"preemptible"] +REQUIREMENT_NAMES = ["disk", "walltime", "memory", "cores", "accelerators", "preemptible"] # This is the supertype of all value types in RequirementsDict ParsedRequirement = Union[int, float, bool, list[AcceleratorRequirement]] @@ -454,7 +455,7 @@ class Requirer: """ Base class implementing the storage and presentation of requirements. - Has cores, memory, disk, and preemptability as properties. + Has cores, walltime, memory, disk, and preemptability as properties. """ _requirementOverrides: RequirementsDict @@ -464,7 +465,7 @@ def __init__(self, requirements: Mapping[str, ParseableRequirement | None]) -> N Parse and save the given requirements. :param dict requirements: Dict from string to value - describing a set of resource requirments. 'cores', 'memory', + describing a set of resource requirments. 'cores', 'walltime', 'memory', 'disk', 'preemptible', and 'accelerators' fields, if set, are parsed and broken out into properties. If unset, the relevant property will be unspecified, and will be pulled from the assigned @@ -545,7 +546,7 @@ def __deepcopy__(self, memo: Any) -> Requirer: @overload @staticmethod def _parseResource( - name: Literal["memory"] | Literal["disk"], + name: Literal["memory"] | Literal["disk"] | Literal["walltime"], value: ParseableIndivisibleResource, ) -> int: ... @@ -610,7 +611,7 @@ def _parseResource( # Anything can be None. return value - if name in ("memory", "disk", "cores"): + if name in ("memory", "disk", "cores", "walltime"): # These should be numbers that accept things like "5G". 
if isinstance(value, bytes): value = value.decode("utf-8") @@ -722,6 +723,15 @@ def memory(self) -> int: def memory(self, val: ParseableIndivisibleResource) -> None: self._requirementOverrides["memory"] = Requirer._parseResource("memory", val) + @property + def walltime(self) -> int: + """Get the maximum walltime in seconds allowed.""" + return cast(int, self._fetchRequirement("walltime")) + + @walltime.setter + def walltime(self, val: ParseableIndivisibleResource) -> None: + self._requirementOverrides["walltime"] = Requirer._parseResource("walltime", val) + @property def cores(self) -> int | float: """Get the number of CPU cores required.""" @@ -791,7 +801,11 @@ def requirements_string(self) -> str: for k in REQUIREMENT_NAMES: v: str | ParsedRequirement | None = self._fetchRequirement(k) if v is not None: - if isinstance(v, (int, float)) and v > 1000: + if k == "walltime": + if v == 0: + continue + v = seconds_to_dhms(cast(int, v)) + elif isinstance(v, (int, float)) and v > 1000: # Make large numbers readable v = bytes2human(v) parts.append(f"{k}: {v}") @@ -843,7 +857,7 @@ def __init__( :param requirements: Dict from string to number, string, or bool describing the resource requirements of the job. 'cores', 'memory', - 'disk', and 'preemptible' fields, if set, are parsed and broken out + 'disk', 'walltime', and 'preemptible' fields, if set, are parsed and broken out into properties. If unset, the relevant property will be unspecified, and will be pulled from the assigned Config object if queried (see :meth:`toil.job.Requirer.assignConfig`). @@ -1738,6 +1752,7 @@ class Job: def __init__( self, + walltime: ParseableIndivisibleResource | None = None, memory: ParseableIndivisibleResource | None = None, cores: ParseableDivisibleResource | None = None, disk: ParseableIndivisibleResource | None = None, @@ -1756,6 +1771,7 @@ def __init__( This method must be called by any overriding constructor. 
+ :param walltime: the maximum walltime in seconds that the job is allowed to run. :param memory: the maximum number of bytes of memory the job will require to run. :param cores: the number of CPU cores required. :param disk: the amount of local disk space required by the job, expressed in bytes. @@ -1772,6 +1788,7 @@ def __init__( :param local: if the job can be run on the leader. :param files: Set of Files that the job will want to use. + :type walltime: int :type memory: int or string convertible by toil.lib.conversions.human2bytes to an int :type cores: float, int, or string convertible by toil.lib.conversions.human2bytes to an int :type disk: int or string convertible by toil.lib.conversions.human2bytes to an int @@ -1794,6 +1811,7 @@ def __init__( preemptible = preemptable # Build a requirements dict for the description requirements = { + "walltime": walltime, "memory": memory, "cores": cores, "disk": disk, @@ -1906,6 +1924,15 @@ def disk(self) -> int: def disk(self, val: int) -> None: self.description.disk = val + @property + def walltime(self) -> int: + """The maximum walltime in seconds that the job is allowed to run.""" + return self.description.walltime + + @walltime.setter + def walltime(self, val: int) -> None: + self.description.walltime = val + @property def memory(self) -> int: """The maximum number of bytes of memory the job will require to run.""" @@ -2688,6 +2715,7 @@ class Service(Requirer, metaclass=ABCMeta): def __init__( self, + walltime: ParseableIndivisibleResource | None = None, memory: ParseableIndivisibleResource | None = None, cores: ParseableDivisibleResource | None = None, disk: ParseableIndivisibleResource | None = None, @@ -2696,13 +2724,14 @@ def __init__( unitName: str | None = "", ) -> None: """ - Memory, core and disk requirements are specified identically to as in \ + Memory, walltime, core and disk requirements are specified identically to as in \ :func:`toil.job.Job.__init__`. 
""" # Save the requirements in ourselves so they are visible on `self` to user code. super().__init__( { "memory": memory, + "walltime": walltime, "cores": cores, "disk": disk, "accelerators": accelerators, @@ -3459,7 +3488,7 @@ def __init__( ``**kwargs`` as arguments. The keywords ``memory``, ``cores``, ``disk``, ``accelerators`, - ``preemptible`` and ``checkpoint`` are reserved keyword arguments that + ``preemptible``, ``walltime``, and ``checkpoint`` are reserved keyword arguments that if specified will be used to determine the resources required for the job, as :func:`toil.job.Job.__init__`. If they are keyword arguments to the function they will be extracted from the function definition, but @@ -3494,6 +3523,7 @@ def resolve(key: str, default: Any | None = None, dehumanize: bool = False) -> A super().__init__( memory=resolve("memory", dehumanize=True), + walltime=resolve("walltime"), cores=resolve("cores", dehumanize=True), disk=resolve("disk", dehumanize=True), accelerators=resolve("accelerators"), @@ -3555,6 +3585,7 @@ class JobFunctionWrappingJob(FunctionWrappingJob): can be specified: - memory + - walltime - disk - cores - accelerators @@ -3562,7 +3593,7 @@ class JobFunctionWrappingJob(FunctionWrappingJob): For example to wrap a function into a job we would call:: - Job.wrapJobFn(myJob, memory='100k', disk='1M', cores=0.1) + Job.wrapJobFn(myJob, memory='100k', disk='1M', cores=0.1, walltime=0) """ @@ -3592,6 +3623,7 @@ def __init__(self, userFunction: Callable[..., Any], *args: Any, **kwargs: Any) disk="1M", memory="32M", cores=0.1, + walltime=0, accelerators=[], preemptible=True, preemptable=True, @@ -3693,6 +3725,7 @@ def __init__(self, job: Job | None, unitName: str | None = None) -> None: disk="100M", memory="512M", cores=0.1, + walltime=0, unitName=None if unitName is None else unitName + "-followOn", ) Job.addFollowOn(self, self.encapsulatedFollowOn) diff --git a/src/toil/lib/conversions.py b/src/toil/lib/conversions.py index 50c64e34d4..1827684719 
100644 --- a/src/toil/lib/conversions.py +++ b/src/toil/lib/conversions.py @@ -3,6 +3,7 @@ Also contains general conversion functions """ +import datetime import math import urllib.parse from typing import SupportsInt @@ -179,6 +180,19 @@ def hms_duration_to_seconds(hms: str) -> float: return seconds +def seconds_to_dhms(seconds: int) -> str: + """ + Convert seconds to a days-hours:minutes:seconds string. + """ + if seconds < 0: + raise ValueError("Invalid Time, negative value") + + walltime = datetime.timedelta(seconds=seconds) + days = walltime.days + remainder = str(walltime).split(",")[int(days > 0)].strip() + return f"{days}-{remainder}" + + def strtobool(val: str) -> bool: """ Make a human-readable string into a bool. diff --git a/src/toil/options/common.py b/src/toil/options/common.py index 04f0f089da..5417c0dbcd 100644 --- a/src/toil/options/common.py +++ b/src/toil/options/common.py @@ -758,6 +758,7 @@ def __call__( ) cpu_note = "Fractions of a core (for example 0.1) are supported on some batch systems [mesos, single_machine]" disk_mem_note = "Standard suffixes like K, Ki, M, Mi, G or Gi are supported" + disk_walltime_note = "Values are assumed to be in seconds. A value of 0 does not limit the walltime" accelerators_note = ( "Each accelerator specification can have a type (gpu [default], nvidia, amd, cuda, rocm, opencl, " "or a specific model like nvidia-tesla-k80), and a count [default: 1]. 
If both a type and a count " ) @@ -777,6 +778,16 @@ def __call__( "default", "memory", disk_mem_note, bytes2human(2147483648) ), ) + resource_options.add_argument( + "--defaultWalltime", + dest="defaultWalltime", + default=0, + type=int, + action=make_open_interval_action(0), + help=resource_help_msg.format( + "default", "walltime", disk_walltime_note, str(0) + ), + ) resource_options.add_argument( "--defaultCores", dest="defaultCores", diff --git a/src/toil/test/batchSystems/batchSystemTest.py b/src/toil/test/batchSystems/batchSystemTest.py index f9d3a168bd..f38d2aff02 100644 --- a/src/toil/test/batchSystems/batchSystemTest.py +++ b/src/toil/test/batchSystems/batchSystemTest.py @@ -86,7 +86,7 @@ # Since we aren't always attaching the config to the jobs for these tests, we # need to use fully specified requirements. defaultRequirements = dict( - memory=int(100e6), cores=1, disk=1000, preemptible=preemptible, accelerators=[] + memory=int(100e6), cores=1, disk=1000, preemptible=preemptible, accelerators=[], walltime=0 ) @@ -918,7 +918,7 @@ def testHidingProcessEscape(self) -> None: Test to make sure that child processes and their descendants go away when the Toil workflow stops, even if the job process stops and leaves children.
""" self.testProcessEscape(hide=True) @@ -1026,6 +1026,7 @@ def test(self) -> None: requirements=dict( cores=float(coresPerJob), memory=1, + walltime=0, disk=1, accelerators=[], preemptible=preemptible, diff --git a/src/toil/test/batchSystems/test_slurm.py b/src/toil/test/batchSystems/test_slurm.py index 9f990a6668..7e2ff5bc14 100644 --- a/src/toil/test/batchSystems/test_slurm.py +++ b/src/toil/test/batchSystems/test_slurm.py @@ -626,14 +626,14 @@ def test_prepareSbatch_partition(self): # Without a partition override in the environment, we should get the # "short" partition for this job - command = self.worker.prepareSbatch(1, 100, 5, "job5", None, None) + command = self.worker.prepareSbatch(1, 100, 0, 5, "job5", None, None) assert "--partition=short" in command # With a partition override, we should not. But the override will be rewritten. self.worker.boss.config.slurm_args = ( "--something --partition foo --somethingElse" ) - command = self.worker.prepareSbatch(1, 100, 5, "job5", None, None) + command = self.worker.prepareSbatch(1, 100, 0, 5, "job5", None, None) assert "--partition=short" not in command assert "--partition=foo" in command @@ -641,27 +641,27 @@ def test_prepareSbatch_partition(self): self.worker.boss.config.slurm_args = ( "--something --partition=foo --somethingElse" ) - command = self.worker.prepareSbatch(1, 100, 5, "job5", None, None) + command = self.worker.prepareSbatch(1, 100, 0, 5, "job5", None, None) assert "--partition=short" not in command assert "--partition=foo" in command # And short options self.worker.boss.config.slurm_args = "--something -p foo --somethingElse" - command = self.worker.prepareSbatch(1, 100, 5, "job5", None, None) + command = self.worker.prepareSbatch(1, 100, 0, 5, "job5", None, None) assert "--partition=short" not in command assert "--partition=foo" in command # Partition settings from the config should override automatic selection self.worker.boss.config.slurm_partition = "foobar"
self.worker.boss.config.slurm_args = "--something --somethingElse" - command = self.worker.prepareSbatch(1, 100, 5, "job5", None, None) + command = self.worker.prepareSbatch(1, 100, 0, 5, "job5", None, None) assert "--partition=foobar" in command # But they should be overridden by the argument overrides self.worker.boss.config.slurm_args = ( "--something --partition=baz --somethingElse" ) - command = self.worker.prepareSbatch(1, 100, 5, "job5", None, None) + command = self.worker.prepareSbatch(1, 100, 0, 5, "job5", None, None) assert "--partition=baz" in command def test_prepareSbatch_time(self): @@ -673,7 +673,7 @@ def test_prepareSbatch_time(self): # Without a time override in the environment, we should use the normal # time and the "short" partition - command = self.worker.prepareSbatch(1, 100, 5, "job5", None, None) + command = self.worker.prepareSbatch(1, 100, 0, 5, "job5", None, None) logger.debug("Command: %s", command) assert "--time=0:30" in command assert "--partition=short" in command @@ -683,7 +683,7 @@ def test_prepareSbatch_time(self): self.worker.boss.config.slurm_args = ( "--something --time 10:00:00 --somethingElse" ) - command = self.worker.prepareSbatch(1, 100, 5, "job5", None, None) + command = self.worker.prepareSbatch(1, 100, 0, 5, "job5", None, None) logger.debug("Command: %s", command) assert "--partition=medium" in command assert "--time=0:36000" in command @@ -692,14 +692,14 @@ def test_prepareSbatch_time(self): self.worker.boss.config.slurm_args = ( "--something --time=10:00:00 --somethingElse" ) - command = self.worker.prepareSbatch(1, 100, 5, "job5", None, None) + command = self.worker.prepareSbatch(1, 100, 0, 5, "job5", None, None) logger.debug("Command: %s", command) assert "--partition=medium" in command assert "--time=0:36000" in command # And short options self.worker.boss.config.slurm_args = "--something -t 10:00:00 --somethingElse" - command = self.worker.prepareSbatch(1, 100, 5, "job5", None, None) + command =
self.worker.prepareSbatch(1, 100, 0, 5, "job5", None, None) logger.debug("Command: %s", command) assert "--partition=medium" in command assert "--time=0:36000" in command @@ -710,17 +710,17 @@ def test_prepareSbatch_export(self): self.worker.boss.partitions = ps # Without any overrides, we need --export=ALL - command = self.worker.prepareSbatch(1, 100, 5, "job5", None, None) + command = self.worker.prepareSbatch(1, 100, 0, 5, "job5", None, None) assert "--export=ALL" in command # With overrides, we don't get --export=ALL self.worker.boss.config.slurm_args = "--export=foo" - command = self.worker.prepareSbatch(1, 100, 5, "job5", None, None) + command = self.worker.prepareSbatch(1, 100, 0, 5, "job5", None, None) assert "--export=ALL" not in command # With --export-file, we don't get --export=ALL as documented. self.worker.boss.config.slurm_args = "--export-file=./thefile.txt" - command = self.worker.prepareSbatch(1, 100, 5, "job5", None, None) + command = self.worker.prepareSbatch(1, 100, 0, 5, "job5", None, None) assert "--export=ALL" not in command def test_option_detector(self): diff --git a/src/toil/test/cwl/cwlTest.py b/src/toil/test/cwl/cwlTest.py index d440374fdf..dee07b4600 100644 --- a/src/toil/test/cwl/cwlTest.py +++ b/src/toil/test/cwl/cwlTest.py @@ -1885,6 +1885,22 @@ def test_workflow_echo_string_scatter_capture_stdout(tmp_path: Path) -> None: assert p.returncode == 0 +@needs_cwl +@pytest.mark.cwl +@pytest.mark.cwl_small +def test_timelimit_expression(tmp_path: Path) -> None: + with get_data("test/cwl/timelimit.cwl") as cwl_file: + cmd = [ + "toil-cwl-runner", + f"--jobStore=file:{tmp_path / 'jobStore'}", + str(cwl_file), + ] + p = subprocess.run(cmd, capture_output=True, text=True) + assert len(p.stdout) > 0 + assert "Finished toil run successfully" in p.stderr + assert p.returncode == 0 + + @needs_cwl @pytest.mark.cwl @pytest.mark.cwl_small diff --git a/src/toil/test/cwl/timelimit.cwl b/src/toil/test/cwl/timelimit.cwl new file mode 100644 index
0000000000..f73a047780 --- /dev/null +++ b/src/toil/test/cwl/timelimit.cwl @@ -0,0 +1,11 @@ +class: CommandLineTool +cwlVersion: v1.2 +inputs: [] +outputs: [] +requirements: + InlineJavascriptRequirement: {} + ToolTimeLimit: + timelimit: $(3*4) + WorkReuse: + enableReuse: false +baseCommand: [sleep, "3"]