From dba6323b36ca95a3b289dc8018a087e240b9a175 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Wed, 27 May 2026 13:42:46 +0200 Subject: [PATCH 01/10] feat: add classes for vcf, tmb trace and nirvana json --- src/tsoppy/general/input_classes.py | 130 ++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 src/tsoppy/general/input_classes.py diff --git a/src/tsoppy/general/input_classes.py b/src/tsoppy/general/input_classes.py new file mode 100644 index 0000000..0469fde --- /dev/null +++ b/src/tsoppy/general/input_classes.py @@ -0,0 +1,130 @@ +from pathlib import Path +from typing import Dict, Optional +import cyvcf2 +import msgspec +import os +import polars + + +class BaseInput: + """Base class for inputs produced by different workflows (e.g. dragen/localapp). + + Subclasses should define `default_subpath_formats` mapping workflow names to + subpath format strings that accept `(sample, sample)` for formatting. + + Attributes: + paths: Mapping of workflow names to resolved Path objects (Dict[str, Path]) + root: Root path (Path) + sample: Sample name (str) + subpath_formats: Mapping of workflow names to subpath format strings (Dict[str, str]) + type: Detected workflow type (str) + """ + + subpath_formats: Dict[str, str] = {} + type: Optional[str] = None + path: Optional[Path] = None + + def __init__(self, sample: str, root_path: str | Path, subpath_formats: Optional[Dict[str, str]] = None): + """Initialize the BaseInput.""" + self.sample = sample + self.root = Path(root_path) + if subpath_formats: + self.subpath_formats.update(subpath_formats) + + self._resolve_paths() + self._detect_type() + + def _resolve_paths(self): + """Resolve the paths for each workflow type based on the provided subpath formats.""" + out: Dict[str, Path] = {} + for name, fmt in self.subpath_formats.items(): + out[name] = Path(os.path.join( + self.root, fmt.format(self.sample, self.sample))) + self.paths = out + + def _detect_type(self): + """Detect which workflow 
type is present based on the existence of the resolved paths.""" + found = [name for name, path in self.paths.items() if path.is_file()] + if len(found) > 1: + raise ValueError( + f"Multiple workflow files found for sample {self.sample}: {found}") + if not found: + raise FileNotFoundError( + f"No workflow file found for sample {self.sample}. Searched: {self.paths}") + self.type = found[0] + self.path = self.paths[self.type] + + +class Vcf(BaseInput): + """Input class for VCF files produced by different workflows. + + Attributes: + default_subpath_formats: Mapping of workflow names to subpath format strings (Dict[str, str]) + vcf: Parsed VCF object (cyvcf2.VCF) + """ + + default_subpath_formats = { + "dragen": "Logs_Intermediates/DnaDragenCaller/{}/{}.hard-filtered.gvcf.gz", + "localapp": "Logs_Intermediates/VariantMatching/{}/{}_MergedSmallVariants.genome.vcf", + } + + def __init__(self, sample: str, root_path: str | Path, subpath_formats: Optional[Dict[str, str]] = None): + if subpath_formats: + super().__init__(sample, root_path, subpath_formats) + else: + super().__init__(sample, root_path, self.default_subpath_formats) + + def parse(self): + """Parse the VCF file""" + self.vcf = cyvcf2.VCF(self.path) + return self.vcf + + +class TmbTrace(BaseInput): + """Input class for TMB trace files produced by different workflows. 
+ + Attributes: + rows: Parsed rows of the TMB trace file (polars.DataFrame) + """ + + default_subpath_formats = { + "dragen": "Logs_Intermediates/Tmb/{}/{}.tmb.trace.tsv", + "localapp": "Logs_Intermediates/Tmb/{}/{}_TMB_Trace.tsv", + } + + def __init__(self, sample: str, root_path: str | Path, subpath_formats: Optional[Dict[str, str]] = None): + if subpath_formats: + super().__init__(sample, root_path, subpath_formats) + else: + super().__init__(sample, root_path, self.default_subpath_formats) + + def parse(self): + """Parse the TMB trace file""" + self.table = polars.read_csv( + self.path, separator="\t") + return self.table + + +class AnnotatedJson(BaseInput): + """Input class for annotated JSON files produced by different workflows. + + Attributes: + data: Parsed JSON data (dict) + """ + + default_subpath_formats = { + "dragen": "Logs_Intermediates/Annotation/{}/{}_DNAVariants_Annotated.json", + "localapp": "Logs_Intermediates/Annotation/{}/{}_SmallVariants_Annotated.json.gz", + } + + def __init__(self, sample: str, root_path: str | Path, subpath_formats: Optional[Dict[str, str]] = None): + if subpath_formats: + super().__init__(sample, root_path, subpath_formats) + else: + super().__init__(sample, root_path, self.default_subpath_formats) + + def parse(self): + """Parse the annotated JSON file""" + with open(self.path, 'r') as file: + self.data = msgspec.json.decode(file.read()) + return self.data From 34f614995fe07a07eef94a2bfab02741aa43801d Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Wed, 27 May 2026 14:05:20 +0200 Subject: [PATCH 02/10] chore: sort imports --- src/tsoppy/general/input_classes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tsoppy/general/input_classes.py b/src/tsoppy/general/input_classes.py index 0469fde..0324f76 100644 --- a/src/tsoppy/general/input_classes.py +++ b/src/tsoppy/general/input_classes.py @@ -1,8 +1,9 @@ +import os from pathlib import Path from typing import Dict, Optional + import cyvcf2 
import msgspec -import os import polars From 3c2e3edfaa10cbcbfc9217c5a04b96b32d863b43 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Wed, 27 May 2026 14:07:28 +0200 Subject: [PATCH 03/10] chore: lint --- src/tsoppy/general/input_classes.py | 44 +++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/src/tsoppy/general/input_classes.py b/src/tsoppy/general/input_classes.py index 0324f76..e89fdd9 100644 --- a/src/tsoppy/general/input_classes.py +++ b/src/tsoppy/general/input_classes.py @@ -25,7 +25,12 @@ class BaseInput: type: Optional[str] = None path: Optional[Path] = None - def __init__(self, sample: str, root_path: str | Path, subpath_formats: Optional[Dict[str, str]] = None): + def __init__( + self, + sample: str, + root_path: str | Path, + subpath_formats: Optional[Dict[str, str]] = None, + ): """Initialize the BaseInput.""" self.sample = sample self.root = Path(root_path) @@ -39,8 +44,9 @@ def _resolve_paths(self): """Resolve the paths for each workflow type based on the provided subpath formats.""" out: Dict[str, Path] = {} for name, fmt in self.subpath_formats.items(): - out[name] = Path(os.path.join( - self.root, fmt.format(self.sample, self.sample))) + out[name] = Path( + os.path.join(self.root, fmt.format(self.sample, self.sample)) + ) self.paths = out def _detect_type(self): @@ -48,10 +54,12 @@ def _detect_type(self): found = [name for name, path in self.paths.items() if path.is_file()] if len(found) > 1: raise ValueError( - f"Multiple workflow files found for sample {self.sample}: {found}") + f"Multiple workflow files found for sample {self.sample}: {found}" + ) if not found: raise FileNotFoundError( - f"No workflow file found for sample {self.sample}. Searched: {self.paths}") + f"No workflow file found for sample {self.sample}. 
Searched: {self.paths}" + ) self.type = found[0] self.path = self.paths[self.type] @@ -69,7 +77,12 @@ class Vcf(BaseInput): "localapp": "Logs_Intermediates/VariantMatching/{}/{}_MergedSmallVariants.genome.vcf", } - def __init__(self, sample: str, root_path: str | Path, subpath_formats: Optional[Dict[str, str]] = None): + def __init__( + self, + sample: str, + root_path: str | Path, + subpath_formats: Optional[Dict[str, str]] = None, + ): if subpath_formats: super().__init__(sample, root_path, subpath_formats) else: @@ -93,7 +106,12 @@ class TmbTrace(BaseInput): "localapp": "Logs_Intermediates/Tmb/{}/{}_TMB_Trace.tsv", } - def __init__(self, sample: str, root_path: str | Path, subpath_formats: Optional[Dict[str, str]] = None): + def __init__( + self, + sample: str, + root_path: str | Path, + subpath_formats: Optional[Dict[str, str]] = None, + ): if subpath_formats: super().__init__(sample, root_path, subpath_formats) else: @@ -101,8 +119,7 @@ def __init__(self, sample: str, root_path: str | Path, subpath_formats: Optional def parse(self): """Parse the TMB trace file""" - self.table = polars.read_csv( - self.path, separator="\t") + self.table = polars.read_csv(self.path, separator="\t") return self.table @@ -118,7 +135,12 @@ class AnnotatedJson(BaseInput): "localapp": "Logs_Intermediates/Annotation/{}/{}_SmallVariants_Annotated.json.gz", } - def __init__(self, sample: str, root_path: str | Path, subpath_formats: Optional[Dict[str, str]] = None): + def __init__( + self, + sample: str, + root_path: str | Path, + subpath_formats: Optional[Dict[str, str]] = None, + ): if subpath_formats: super().__init__(sample, root_path, subpath_formats) else: @@ -126,6 +148,6 @@ def __init__(self, sample: str, root_path: str | Path, subpath_formats: Optional def parse(self): """Parse the annotated JSON file""" - with open(self.path, 'r') as file: + with open(self.path, "r") as file: self.data = msgspec.json.decode(file.read()) return self.data From 
0c801aea4a422ca39678bd6397845eda04e2757b Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Wed, 27 May 2026 15:17:46 +0200 Subject: [PATCH 04/10] chore: replace Dict with Mapping --- src/tsoppy/general/input_classes.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/tsoppy/general/input_classes.py b/src/tsoppy/general/input_classes.py index e89fdd9..ad8ffb8 100644 --- a/src/tsoppy/general/input_classes.py +++ b/src/tsoppy/general/input_classes.py @@ -1,6 +1,7 @@ import os +from collections.abc import Mapping from pathlib import Path -from typing import Dict, Optional +from typing import Optional import cyvcf2 import msgspec @@ -14,14 +15,14 @@ class BaseInput: subpath format strings that accept `(sample, sample)` for formatting. Attributes: - paths: Mapping of workflow names to resolved Path objects (Dict[str, Path]) + paths: Mapping of workflow names to resolved Path objects (Mapping[str, Path]) root: Root path (Path) sample: Sample name (str) - subpath_formats: Mapping of workflow names to subpath format strings (Dict[str, str]) + subpath_formats: Mapping of workflow names to subpath format strings (Mapping[str, str]) type: Detected workflow type (str) """ - subpath_formats: Dict[str, str] = {} + subpath_formats: Mapping[str, str] = {} type: Optional[str] = None path: Optional[Path] = None @@ -29,7 +30,7 @@ def __init__( self, sample: str, root_path: str | Path, - subpath_formats: Optional[Dict[str, str]] = None, + subpath_formats: Optional[Mapping[str, str]] = None, ): """Initialize the BaseInput.""" self.sample = sample @@ -42,7 +43,7 @@ def __init__( def _resolve_paths(self): """Resolve the paths for each workflow type based on the provided subpath formats.""" - out: Dict[str, Path] = {} + out: Mapping[str, Path] = {} for name, fmt in self.subpath_formats.items(): out[name] = Path( os.path.join(self.root, fmt.format(self.sample, self.sample)) @@ -68,7 +69,7 @@ class Vcf(BaseInput): """Input class for VCF files 
produced by different workflows. Attributes: - default_subpath_formats: Mapping of workflow names to subpath format strings (Dict[str, str]) + default_subpath_formats: Mapping of workflow names to subpath format strings (Mapping[str, str]) vcf: Parsed VCF object (cyvcf2.VCF) """ @@ -81,7 +82,7 @@ def __init__( self, sample: str, root_path: str | Path, - subpath_formats: Optional[Dict[str, str]] = None, + subpath_formats: Optional[Mapping[str, str]] = None, ): if subpath_formats: super().__init__(sample, root_path, subpath_formats) @@ -110,7 +111,7 @@ def __init__( self, sample: str, root_path: str | Path, - subpath_formats: Optional[Dict[str, str]] = None, + subpath_formats: Optional[Mapping[str, str]] = None, ): if subpath_formats: super().__init__(sample, root_path, subpath_formats) @@ -139,7 +140,7 @@ def __init__( self, sample: str, root_path: str | Path, - subpath_formats: Optional[Dict[str, str]] = None, + subpath_formats: Optional[Mapping[str, str]] = None, ): if subpath_formats: super().__init__(sample, root_path, subpath_formats) From 1cab661a06c099004cff8caba83eb730417916bb Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Thu, 28 May 2026 15:49:17 +0200 Subject: [PATCH 05/10] feat: rename sample to sample id --- src/tsoppy/general/input_classes.py | 33 +++++++++++++++-------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/src/tsoppy/general/input_classes.py b/src/tsoppy/general/input_classes.py index ad8ffb8..4320dee 100644 --- a/src/tsoppy/general/input_classes.py +++ b/src/tsoppy/general/input_classes.py @@ -12,12 +12,12 @@ class BaseInput: """Base class for inputs produced by different workflows (e.g. dragen/localapp). Subclasses should define `default_subpath_formats` mapping workflow names to - subpath format strings that accept `(sample, sample)` for formatting. + subpath format strings that accept `(sample_id, sample_id)` for formatting. 
Attributes: paths: Mapping of workflow names to resolved Path objects (Mapping[str, Path]) root: Root path (Path) - sample: Sample name (str) + sample_id: Sample ID (str) subpath_formats: Mapping of workflow names to subpath format strings (Mapping[str, str]) type: Detected workflow type (str) """ @@ -28,12 +28,12 @@ class BaseInput: def __init__( self, - sample: str, + sample_id: str, root_path: str | Path, subpath_formats: Optional[Mapping[str, str]] = None, ): """Initialize the BaseInput.""" - self.sample = sample + self.sample_id = sample_id self.root = Path(root_path) if subpath_formats: self.subpath_formats.update(subpath_formats) @@ -46,7 +46,8 @@ def _resolve_paths(self): out: Mapping[str, Path] = {} for name, fmt in self.subpath_formats.items(): out[name] = Path( - os.path.join(self.root, fmt.format(self.sample, self.sample)) + os.path.join(self.root, fmt.format( + self.sample_id, self.sample_id)) ) self.paths = out @@ -55,11 +56,11 @@ def _detect_type(self): found = [name for name, path in self.paths.items() if path.is_file()] if len(found) > 1: raise ValueError( - f"Multiple workflow files found for sample {self.sample}: {found}" + f"Multiple workflow files found for sample {self.sample_id}: {found}" ) if not found: raise FileNotFoundError( - f"No workflow file found for sample {self.sample}. Searched: {self.paths}" + f"No workflow file found for sample {self.sample_id}. 
Searched: {self.paths}" ) self.type = found[0] self.path = self.paths[self.type] @@ -80,14 +81,14 @@ class Vcf(BaseInput): def __init__( self, - sample: str, + sample_id: str, root_path: str | Path, subpath_formats: Optional[Mapping[str, str]] = None, ): if subpath_formats: - super().__init__(sample, root_path, subpath_formats) + super().__init__(sample_id, root_path, subpath_formats) else: - super().__init__(sample, root_path, self.default_subpath_formats) + super().__init__(sample_id, root_path, self.default_subpath_formats) def parse(self): """Parse the VCF file""" @@ -109,14 +110,14 @@ class TmbTrace(BaseInput): def __init__( self, - sample: str, + sample_id: str, root_path: str | Path, subpath_formats: Optional[Mapping[str, str]] = None, ): if subpath_formats: - super().__init__(sample, root_path, subpath_formats) + super().__init__(sample_id, root_path, subpath_formats) else: - super().__init__(sample, root_path, self.default_subpath_formats) + super().__init__(sample_id, root_path, self.default_subpath_formats) def parse(self): """Parse the TMB trace file""" @@ -138,14 +139,14 @@ class AnnotatedJson(BaseInput): def __init__( self, - sample: str, + sample_id: str, root_path: str | Path, subpath_formats: Optional[Mapping[str, str]] = None, ): if subpath_formats: - super().__init__(sample, root_path, subpath_formats) + super().__init__(sample_id, root_path, subpath_formats) else: - super().__init__(sample, root_path, self.default_subpath_formats) + super().__init__(sample_id, root_path, self.default_subpath_formats) def parse(self): """Parse the annotated JSON file""" From 89183e887338908671014ba20e518ddd31622809 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Thu, 11 Jun 2026 15:16:35 +0200 Subject: [PATCH 06/10] feat: workflow type and version are retrieved from MetricsOutput.tsv --- src/tsoppy/general/classes.py | 168 ++++++++++++++++++++++++++++ src/tsoppy/general/input_classes.py | 155 ------------------------- 2 files changed, 168 insertions(+), 155 
deletions(-) create mode 100644 src/tsoppy/general/classes.py delete mode 100644 src/tsoppy/general/input_classes.py diff --git a/src/tsoppy/general/classes.py b/src/tsoppy/general/classes.py new file mode 100644 index 0000000..8499df5 --- /dev/null +++ b/src/tsoppy/general/classes.py @@ -0,0 +1,168 @@ +import logging +import os +from pathlib import Path + +import cyvcf2 +import msgspec +import polars + +from tsoppy.general.file_parser import Parse_section_tsv + +# Use logger that was set up in CLI +logger = logging.getLogger(__name__) + + +class WorkflowConfig(msgspec.Struct): + """Config class for workflow output file path format strings""" + + metrics_output_tsv: dict[str, str] + small_variant_genome_vcf: dict[str, str] + tmb_trace_tsv: dict[str, str] + variants_annotated_json: dict[str, str] + + +class WorkflowOutput: + """Base class for outputs produced by different workflows (e.g. dragen/localapp). + + Attributes: + config: Configuration (WorkflowConfig) + root: Root path (Path) + workflow_type: Detected workflow type (str) + workflow_version: Detected workflow version (str) + """ + + def __init__(self, config_yaml: str | Path, root_path: str | Path): + """Initialize WorkflowOutput.""" + self.root = Path(root_path) + with open(config_yaml, "r") as yaml_file: + self.config = msgspec.yaml.decode(yaml_file.read(), type=WorkflowConfig) + + self._detect_type_and_version() + + def _detect_type_and_version(self): + """Detect which workflow type and version is present based on information in MetricsOutput.tsv.""" + info_src = list(self.config.metrics_output_tsv.values()) + if len(set(info_src)) != 1: + raise ValueError( + f"Got {info_src} but need exactly one file to detect workflow id" + ) + headers, sections = Parse_section_tsv( + os.path.join(self.root, info_src[0]), ["Header"] + ) + if "DRAGEN" in headers[0]: + self.workflow_type = "dragen" + else: + self.workflow_type = "localapp" + self.workflow_version = sections["Header"].item( + row=0, column="Workflow 
Version" + ) + + def workflow_id(self): + """Return combined workflow typ and version.""" + return f"{self.workflow_type}_{self.workflow_version}" + + +class SmallVariantGenomeVcf(WorkflowOutput): + """Input class for small variant genome VCF files produced by different workflows. + + Attributes: + path: Path to vcf (Path) + sample_id: Sample identifier (str) + vcf: Parsed VCF object (cyvcf2.VCF) + """ + + def __init__(self, config_yaml: str | Path, root_path: str | Path, sample_id: str): + super().__init__(config_yaml, root_path) + self.sample_id = sample_id + + @classmethod + def create(cls, workflow_output: WorkflowOutput, sample_id: str): + obj = cls.__new__(cls) + obj.__dict__.update(workflow_output.__dict__) + obj.sample_id = sample_id + return obj + + def parse(self): + """Parse the VCF file""" + fmt = self.config.small_variant_genome_vcf[self.workflow_id()] + self.path = Path( + os.path.join(self.root, fmt.format(self.sample_id, self.sample_id)) + ) + if not self.path.is_file(): + logging.error( + f"Small variant genome VCF missing: File {self.path} does not exist." + ) + raise FileNotFoundError + self.vcf = cyvcf2.VCF(self.path) + return self.vcf + + +class TmbTrace(WorkflowOutput): + """Input class for TMB trace files produced by different workflows. 
+ + Attributes: + path: Path to vcf (Path) + rows: Parsed rows of the TMB trace file (polars.DataFrame) + sample_id: Sample identifier (str) + """ + + def __init__(self, config_yaml: str | Path, root_path: str | Path, sample_id: str): + super().__init__(config_yaml, root_path) + self.sample_id = sample_id + + @classmethod + def create(cls, workflow_output: WorkflowOutput, sample_id: str): + obj = cls.__new__(cls) + obj.__dict__.update(workflow_output.__dict__) + obj.sample_id = sample_id + return obj + + def parse(self): + """Parse the TMB trace file""" + fmt = self.config.tmb_trace_tsv[self.workflow_id()] + self.path = Path( + os.path.join(self.root, fmt.format(self.sample_id, self.sample_id)) + ) + if not self.path.is_file(): + logging.error( + f"Small variant genome VCF missing: File {self.path} does not exist." + ) + raise FileNotFoundError + self.table = polars.read_csv(self.path, separator="\t") + return self.table + + +class VariantsAnnotatedJson(WorkflowOutput): + """Input class for annotated JSON files produced by different workflows. + + Attributes: + path: Path to vcf (Path) + data: Parsed JSON data (dict) + sample_id: Sample identifier (str) + """ + + def __init__(self, config_yaml: str | Path, root_path: str | Path, sample_id: str): + super().__init__(config_yaml, root_path) + self.sample_id = sample_id + + @classmethod + def create(cls, workflow_output: WorkflowOutput, sample_id: str): + obj = cls.__new__(cls) + obj.__dict__.update(workflow_output.__dict__) + obj.sample_id = sample_id + return obj + + def parse(self): + """Parse the variants annotated JSON file""" + fmt = self.config.variants_annotated_json[self.workflow_id()] + self.path = Path( + os.path.join(self.root, fmt.format(self.sample_id, self.sample_id)) + ) + if not self.path.is_file(): + logging.error( + f"Small variant genome VCF missing: File {self.path} does not exist." 
+ ) + raise FileNotFoundError + with open(self.path, "r") as file: + self.data = msgspec.json.decode(file.read()) + return self.data diff --git a/src/tsoppy/general/input_classes.py b/src/tsoppy/general/input_classes.py deleted file mode 100644 index 4320dee..0000000 --- a/src/tsoppy/general/input_classes.py +++ /dev/null @@ -1,155 +0,0 @@ -import os -from collections.abc import Mapping -from pathlib import Path -from typing import Optional - -import cyvcf2 -import msgspec -import polars - - -class BaseInput: - """Base class for inputs produced by different workflows (e.g. dragen/localapp). - - Subclasses should define `default_subpath_formats` mapping workflow names to - subpath format strings that accept `(sample_id, sample_id)` for formatting. - - Attributes: - paths: Mapping of workflow names to resolved Path objects (Mapping[str, Path]) - root: Root path (Path) - sample_id: Sample ID (str) - subpath_formats: Mapping of workflow names to subpath format strings (Mapping[str, str]) - type: Detected workflow type (str) - """ - - subpath_formats: Mapping[str, str] = {} - type: Optional[str] = None - path: Optional[Path] = None - - def __init__( - self, - sample_id: str, - root_path: str | Path, - subpath_formats: Optional[Mapping[str, str]] = None, - ): - """Initialize the BaseInput.""" - self.sample_id = sample_id - self.root = Path(root_path) - if subpath_formats: - self.subpath_formats.update(subpath_formats) - - self._resolve_paths() - self._detect_type() - - def _resolve_paths(self): - """Resolve the paths for each workflow type based on the provided subpath formats.""" - out: Mapping[str, Path] = {} - for name, fmt in self.subpath_formats.items(): - out[name] = Path( - os.path.join(self.root, fmt.format( - self.sample_id, self.sample_id)) - ) - self.paths = out - - def _detect_type(self): - """Detect which workflow type is present based on the existence of the resolved paths.""" - found = [name for name, path in self.paths.items() if path.is_file()] - if 
len(found) > 1: - raise ValueError( - f"Multiple workflow files found for sample {self.sample_id}: {found}" - ) - if not found: - raise FileNotFoundError( - f"No workflow file found for sample {self.sample_id}. Searched: {self.paths}" - ) - self.type = found[0] - self.path = self.paths[self.type] - - -class Vcf(BaseInput): - """Input class for VCF files produced by different workflows. - - Attributes: - default_subpath_formats: Mapping of workflow names to subpath format strings (Mapping[str, str]) - vcf: Parsed VCF object (cyvcf2.VCF) - """ - - default_subpath_formats = { - "dragen": "Logs_Intermediates/DnaDragenCaller/{}/{}.hard-filtered.gvcf.gz", - "localapp": "Logs_Intermediates/VariantMatching/{}/{}_MergedSmallVariants.genome.vcf", - } - - def __init__( - self, - sample_id: str, - root_path: str | Path, - subpath_formats: Optional[Mapping[str, str]] = None, - ): - if subpath_formats: - super().__init__(sample_id, root_path, subpath_formats) - else: - super().__init__(sample_id, root_path, self.default_subpath_formats) - - def parse(self): - """Parse the VCF file""" - self.vcf = cyvcf2.VCF(self.path) - return self.vcf - - -class TmbTrace(BaseInput): - """Input class for TMB trace files produced by different workflows. 
- - Attributes: - rows: Parsed rows of the TMB trace file (polars.DataFrame) - """ - - default_subpath_formats = { - "dragen": "Logs_Intermediates/Tmb/{}/{}.tmb.trace.tsv", - "localapp": "Logs_Intermediates/Tmb/{}/{}_TMB_Trace.tsv", - } - - def __init__( - self, - sample_id: str, - root_path: str | Path, - subpath_formats: Optional[Mapping[str, str]] = None, - ): - if subpath_formats: - super().__init__(sample_id, root_path, subpath_formats) - else: - super().__init__(sample_id, root_path, self.default_subpath_formats) - - def parse(self): - """Parse the TMB trace file""" - self.table = polars.read_csv(self.path, separator="\t") - return self.table - - -class AnnotatedJson(BaseInput): - """Input class for annotated JSON files produced by different workflows. - - Attributes: - data: Parsed JSON data (dict) - """ - - default_subpath_formats = { - "dragen": "Logs_Intermediates/Annotation/{}/{}_DNAVariants_Annotated.json", - "localapp": "Logs_Intermediates/Annotation/{}/{}_SmallVariants_Annotated.json.gz", - } - - def __init__( - self, - sample_id: str, - root_path: str | Path, - subpath_formats: Optional[Mapping[str, str]] = None, - ): - if subpath_formats: - super().__init__(sample_id, root_path, subpath_formats) - else: - super().__init__(sample_id, root_path, self.default_subpath_formats) - - def parse(self): - """Parse the annotated JSON file""" - with open(self.path, "r") as file: - self.data = msgspec.json.decode(file.read()) - return self.data From f3713983fa7c680f7a8aafbb4c6f939812e04953 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Thu, 11 Jun 2026 15:17:12 +0200 Subject: [PATCH 07/10] chore: add config yaml containing subpath format strings for workflow files --- config.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 config.yaml diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..3563fb1 --- /dev/null +++ b/config.yaml @@ -0,0 +1,12 @@ +metrics_output_tsv: + dragen_2.6.2.4: 
Logs_Intermediates/MetricsOutput/MetricsOutput.tsv + localapp_ruo-2.2.0.12: Logs_Intermediates/MetricsOutput/MetricsOutput.tsv +small_variant_genome_vcf: + dragen_2.6.2.4: Logs_Intermediates/DnaDragenCaller/{}/{}.hard-filtered.gvcf.gz + localapp_ruo-2.2.0.12: Logs_Intermediates/VariantMatching/{}/{}_MergedSmallVariants.genome.vcf +tmb_trace_tsv: + dragen_2.6.2.4: Logs_Intermediates/Tmb/{}/{}.tmb.trace.tsv + localapp_ruo-2.2.0.12: Logs_Intermediates/Tmb/{}/{}_TMB_Trace.tsv +variants_annotated_json: + dragen_2.6.2.4: Logs_Intermediates/Annotation/{}/{}_DNAVariants_Annotated.json + localapp_ruo-2.2.0.12: Logs_Intermediates/Annotation/{}/{}_SmallVariants_Annotated.json.gz \ No newline at end of file From 5d0d0359f347188af383781e9c4bfe21b3f220d2 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Fri, 12 Jun 2026 10:15:20 +0200 Subject: [PATCH 08/10] docs: include more docstrings and expand existing ones --- src/tsoppy/general/classes.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/tsoppy/general/classes.py b/src/tsoppy/general/classes.py index 8499df5..a9dfe6e 100644 --- a/src/tsoppy/general/classes.py +++ b/src/tsoppy/general/classes.py @@ -41,24 +41,32 @@ def __init__(self, config_yaml: str | Path, root_path: str | Path): def _detect_type_and_version(self): """Detect which workflow type and version is present based on information in MetricsOutput.tsv.""" + + # Get all values for MetricsOutput.tsv paths and check if they are the same info_src = list(self.config.metrics_output_tsv.values()) if len(set(info_src)) != 1: raise ValueError( f"Got {info_src} but need exactly one file to detect workflow id" ) + + # Parse MetricsOutput.tsv headers, sections = Parse_section_tsv( os.path.join(self.root, info_src[0]), ["Header"] ) + + # Check if DRAGEN is part of the header and assume the data is localapp if not if "DRAGEN" in headers[0]: self.workflow_type = "dragen" else: self.workflow_type = "localapp" + + # Set workflow version from 
Header section self.workflow_version = sections["Header"].item( row=0, column="Workflow Version" ) def workflow_id(self): - """Return combined workflow typ and version.""" + """Return combined string for workflow type and version.""" return f"{self.workflow_type}_{self.workflow_version}" @@ -72,11 +80,13 @@ class SmallVariantGenomeVcf(WorkflowOutput): """ def __init__(self, config_yaml: str | Path, root_path: str | Path, sample_id: str): + """Initialize SmallVariantGenomeVcf""" super().__init__(config_yaml, root_path) self.sample_id = sample_id @classmethod def create(cls, workflow_output: WorkflowOutput, sample_id: str): + """Create SmallVariantGenomeVcf from existing WorkflowOutput""" obj = cls.__new__(cls) obj.__dict__.update(workflow_output.__dict__) obj.sample_id = sample_id @@ -107,11 +117,13 @@ class TmbTrace(WorkflowOutput): """ def __init__(self, config_yaml: str | Path, root_path: str | Path, sample_id: str): + """Initialize TmTraceTsv.""" super().__init__(config_yaml, root_path) self.sample_id = sample_id @classmethod def create(cls, workflow_output: WorkflowOutput, sample_id: str): + """Create TmbTraceTsv from existing WorkflowOutput.""" obj = cls.__new__(cls) obj.__dict__.update(workflow_output.__dict__) obj.sample_id = sample_id @@ -142,11 +154,13 @@ class VariantsAnnotatedJson(WorkflowOutput): """ def __init__(self, config_yaml: str | Path, root_path: str | Path, sample_id: str): + """Initialize VariantsAnnotatedJson.""" super().__init__(config_yaml, root_path) self.sample_id = sample_id @classmethod def create(cls, workflow_output: WorkflowOutput, sample_id: str): + """Create VariantsAnnotatedJson from existing WorkflowOutput.""" obj = cls.__new__(cls) obj.__dict__.update(workflow_output.__dict__) obj.sample_id = sample_id From 404c1d37779056d6fd5729cbaabeeb929ac16af1 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Fri, 12 Jun 2026 10:16:23 +0200 Subject: [PATCH 09/10] feat: make file parsing part of the class instance init process --- 
src/tsoppy/general/classes.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/tsoppy/general/classes.py b/src/tsoppy/general/classes.py index a9dfe6e..302f21f 100644 --- a/src/tsoppy/general/classes.py +++ b/src/tsoppy/general/classes.py @@ -83,6 +83,7 @@ def __init__(self, config_yaml: str | Path, root_path: str | Path, sample_id: st """Initialize SmallVariantGenomeVcf""" super().__init__(config_yaml, root_path) self.sample_id = sample_id + self._parse() @classmethod def create(cls, workflow_output: WorkflowOutput, sample_id: str): @@ -90,10 +91,11 @@ def create(cls, workflow_output: WorkflowOutput, sample_id: str): obj = cls.__new__(cls) obj.__dict__.update(workflow_output.__dict__) obj.sample_id = sample_id + obj._parse() return obj - def parse(self): - """Parse the VCF file""" + def _parse(self): + """Parse the small variant genome VCF file""" fmt = self.config.small_variant_genome_vcf[self.workflow_id()] self.path = Path( os.path.join(self.root, fmt.format(self.sample_id, self.sample_id)) @@ -104,7 +106,6 @@ def parse(self): ) raise FileNotFoundError self.vcf = cyvcf2.VCF(self.path) - return self.vcf class TmbTrace(WorkflowOutput): @@ -120,6 +121,7 @@ def __init__(self, config_yaml: str | Path, root_path: str | Path, sample_id: st """Initialize TmTraceTsv.""" super().__init__(config_yaml, root_path) self.sample_id = sample_id + self._parse() @classmethod def create(cls, workflow_output: WorkflowOutput, sample_id: str): @@ -127,10 +129,11 @@ def create(cls, workflow_output: WorkflowOutput, sample_id: str): obj = cls.__new__(cls) obj.__dict__.update(workflow_output.__dict__) obj.sample_id = sample_id + obj._parse() return obj - def parse(self): - """Parse the TMB trace file""" + def _parse(self): + """Parse the TMB trace tsv.""" fmt = self.config.tmb_trace_tsv[self.workflow_id()] self.path = Path( os.path.join(self.root, fmt.format(self.sample_id, self.sample_id)) @@ -141,7 +144,6 @@ def parse(self): ) raise 
FileNotFoundError self.table = polars.read_csv(self.path, separator="\t") - return self.table class VariantsAnnotatedJson(WorkflowOutput): @@ -157,6 +159,7 @@ def __init__(self, config_yaml: str | Path, root_path: str | Path, sample_id: st """Initialize VariantsAnnotatedJson.""" super().__init__(config_yaml, root_path) self.sample_id = sample_id + self._parse() @classmethod def create(cls, workflow_output: WorkflowOutput, sample_id: str): @@ -164,9 +167,10 @@ def create(cls, workflow_output: WorkflowOutput, sample_id: str): obj = cls.__new__(cls) obj.__dict__.update(workflow_output.__dict__) obj.sample_id = sample_id + obj._parse() return obj - def parse(self): + def _parse(self): """Parse the variants annotated JSON file""" fmt = self.config.variants_annotated_json[self.workflow_id()] self.path = Path( @@ -179,4 +183,3 @@ def parse(self): raise FileNotFoundError with open(self.path, "r") as file: self.data = msgspec.json.decode(file.read()) - return self.data From cf87d15929bd812594160d0dca09049b2212af67 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Fri, 12 Jun 2026 10:16:42 +0200 Subject: [PATCH 10/10] feat: rename TmbTrace to TmbTraceTsv --- src/tsoppy/general/classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tsoppy/general/classes.py b/src/tsoppy/general/classes.py index 302f21f..9f01dcd 100644 --- a/src/tsoppy/general/classes.py +++ b/src/tsoppy/general/classes.py @@ -108,7 +108,7 @@ def _parse(self): self.vcf = cyvcf2.VCF(self.path) -class TmbTrace(WorkflowOutput): +class TmbTraceTsv(WorkflowOutput): """Input class for TMB trace files produced by different workflows. Attributes: