diff --git a/docs/appendices/environment_vars.rst b/docs/appendices/environment_vars.rst index d0c9d7dc25..e7c17ed9f2 100644 --- a/docs/appendices/environment_vars.rst +++ b/docs/appendices/environment_vars.rst @@ -156,9 +156,14 @@ There are several environment variables that affect the way Toil runs. +--------------------------------------+-----------------------------------------------------+ | TOIL_SLURM_PARTITION | Partition to send Slurm jobs to. | +--------------------------------------+-----------------------------------------------------+ +| TOIL_SLURM_QOS | Quality Of Service to request for Slurm jobs. | ++--------------------------------------+-----------------------------------------------------+ | TOIL_SLURM_GPU_PARTITION | Partition to send Slurm jobs to if they ask for | | | GPUs. | +--------------------------------------+-----------------------------------------------------+ +| TOIL_SLURM_GPU_QOS | Quality Of Service to request for Slurm jobs if | +| | they ask for GPUs. | ++--------------------------------------+-----------------------------------------------------+ | TOIL_SLURM_PE | Name of the slurm partition to use for parallel | | | jobs. Useful for Slurm clusters that do not offer | | | a partition accepting both single-core and | diff --git a/docs/running/cliOptions.rst b/docs/running/cliOptions.rst index 847e81d614..60d3d6b909 100644 --- a/docs/running/cliOptions.rst +++ b/docs/running/cliOptions.rst @@ -252,8 +252,12 @@ levels in toil are based on priority from the logging module: Slurm job time limit, in [DD-]HH:MM:SS format. --slurmPartition SLURM_PARTITION Partition to send Slurm jobs to. + --slurmQOS SLURM_QOS + Quality Of Service to request for Slurm jobs. --slurmGPUPartition SLURM_GPU_PARTITION Partition to send Slurm jobs to if they ask for GPUs. + --slurmGPUQOS SLURM_GPU_QOS + Quality Of Service to request for Slurm jobs if they ask for GPUs. --slurmPE SLURM_PE Special partition to send Slurm jobs to if they ask for more than 1 CPU. Useful for Slurm clusters that do not offer a partition accepting both single-core and diff --git a/src/toil/batchSystems/slurm.py b/src/toil/batchSystems/slurm.py index c86a0b771c..32213940fe 100644 --- a/src/toil/batchSystems/slurm.py +++ b/src/toil/batchSystems/slurm.py @@ -157,6 +157,23 @@ def is_match(option: str) -> bool: return is_match +def parse_slurm_option_value(args: list[str], i: int) -> tuple[str, int]: + """ + Parse a Slurm option value in either ``--opt=value`` or ``--opt value`` form. + + Returns the extracted value and the last consumed index. + + :param args: list of command-line arguments + :param i: index of the option argument in the list to parse + """ + arg = args[i] + if "=" not in arg: + if i + 1 >= len(args): + raise ValueError(f"No value supplied for Slurm {arg} argument") + return args[i + 1], i + 1 + return arg.split("=", 1)[1], i + + class SlurmBatchSystem(AbstractGridEngineBatchSystem): class PartitionInfo(NamedTuple): partition_name: str @@ -883,6 +900,7 @@ def prepareSbatch( is_export_file_option = option_detector("export-file") is_time_option = option_detector("time", "t") is_partition_option = option_detector("partition", "p") + is_qos_option = option_detector("qos") # We will fill these in with stuff parsed from TOIL_SLURM_ARGS, or # with our own determinations if they aren't there. @@ -892,6 +910,7 @@ def prepareSbatch( export_list = [] # Some items here may be multiple comma-separated values time_limit: int | None = self.boss.config.slurm_time # type: ignore[attr-defined] partition: str | None = None + qos: str | None = None if nativeConfig is not None: logger.debug( @@ -913,15 +932,8 @@ def prepareSbatch( elif is_export_option(arg): # Capture the export argument value so we can modify it export_all = False - if "=" not in arg: - if i + 1 >= len(args): - raise ValueError( - f"No value supplied for Slurm {arg} argument" - ) - i += 1 - export_list.append(args[i]) - else: - export_list.append(arg.split("=", 1)[1]) + export_value, i = parse_slurm_option_value(args, i) + export_list.append(export_value) elif is_export_file_option(arg): # Keep --export-file but turn off --export=ALL in that # case. @@ -929,27 +941,14 @@ def prepareSbatch( sbatch_line.append(arg) elif is_time_option(arg): # Capture the time limit in seconds so we can use it for picking a partition - if "=" not in arg: - if i + 1 >= len(args): - raise ValueError( - f"No value supplied for Slurm {arg} argument" - ) - i += 1 - time_string = args[i] - else: - time_string = arg.split("=", 1)[1] + time_string, i = parse_slurm_option_value(args, i) time_limit = parse_slurm_time(time_string) elif is_partition_option(arg): # Capture the partition so we can run checks on it and know not to assign one - if "=" not in arg: - if i + 1 >= len(args): - raise ValueError( - f"No value supplied for Slurm {arg} argument" - ) - i += 1 - partition = args[i] - else: - partition = arg.split("=", 1)[1] + partition, i = parse_slurm_option_value(args, i) + elif is_qos_option(arg): + # Capture the QOS so we can avoid assigning one on top of it + qos, i = parse_slurm_option_value(args, i) else: # Other arguments pass through. sbatch_line.append(arg) @@ -1013,12 +1012,23 @@ def prepareSbatch( # Pick a partition based on time limit partition = self.boss.partitions.get_partition(time_limit) + if qos is None: + # Apply a configured QOS if one wasn't already supplied. + gpu_qos_override: str | None = self.boss.config.slurm_gpu_qos # type: ignore[attr-defined] + qos_override: str | None = self.boss.config.slurm_qos # type: ignore[attr-defined] + if gpus and gpu_qos_override: + qos = gpu_qos_override + elif qos_override: + qos = qos_override + # Now generate all the arguments if len(export_list) > 0: # add --export to the sbatch sbatch_line.append("--export=" + ",".join(export_list)) if partition is not None: sbatch_line.append(f"--partition={partition}") + if qos is not None: + sbatch_line.append(f"--qos={qos}") if gpus: # Generate GPU assignment argument sbatch_line.append(f"--gres=gpu:{gpus}") @@ -1178,6 +1188,13 @@ def add_options(cls, parser: ArgumentParser | _ArgumentGroup) -> None: env_var="TOIL_SLURM_PARTITION", help="Partition to send Slurm jobs to.", ) + parser.add_argument( + "--slurmQOS", + dest="slurm_qos", + default=None, + env_var="TOIL_SLURM_QOS", + help="Quality Of Service to request for Slurm jobs.", + ) parser.add_argument( "--slurmGPUPartition", dest="slurm_gpu_partition", @@ -1185,6 +1202,13 @@ def add_options(cls, parser: ArgumentParser | _ArgumentGroup) -> None: env_var="TOIL_SLURM_GPU_PARTITION", help="Partition to send Slurm jobs to if they ask for GPUs.", ) + parser.add_argument( + "--slurmGPUQOS", + dest="slurm_gpu_qos", + default=None, + env_var="TOIL_SLURM_GPU_QOS", + help="Quality Of Service to request for Slurm jobs if they ask for GPUs.", + ) parser.add_argument( "--slurmPE", dest="slurm_pe", @@ -1208,6 +1232,8 @@ def setOptions(cls, setOption: OptionSetter) -> None: setOption("slurm_default_all_mem") setOption("slurm_time") setOption("slurm_partition") + setOption("slurm_qos") setOption("slurm_gpu_partition") + setOption("slurm_gpu_qos") setOption("slurm_pe") setOption("slurm_args") diff --git a/src/toil/test/batchSystems/test_slurm.py b/src/toil/test/batchSystems/test_slurm.py index 9f990a6668..081233e4c9 100644 --- a/src/toil/test/batchSystems/test_slurm.py +++ b/src/toil/test/batchSystems/test_slurm.py @@ -751,6 +751,19 @@ def test_any_option_detector(self): self.assertFalse(detector("--no-bazz")) self.assertFalse(detector("--foo-bar=--bazz-only")) + def test_parse_slurm_option_value(self): + args = ["--qos", "high", "--partition=gpu"] + value, index = toil.batchSystems.slurm.parse_slurm_option_value(args, 0) + self.assertEqual(value, "high") + self.assertEqual(index, 1) + + value, index = toil.batchSystems.slurm.parse_slurm_option_value(args, 2) + self.assertEqual(value, "gpu") + self.assertEqual(index, 2) + + with self.assertRaises(ValueError): + toil.batchSystems.slurm.parse_slurm_option_value(["--qos"], 0) + def test_sinfo_not_permitted(self): """Test when ``sinfo`` is not available.