diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000..3563fb1
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,12 @@
+metrics_output_tsv:
+  dragen_2.6.2.4: Logs_Intermediates/MetricsOutput/MetricsOutput.tsv
+  localapp_ruo-2.2.0.12: Logs_Intermediates/MetricsOutput/MetricsOutput.tsv
+small_variant_genome_vcf:
+  dragen_2.6.2.4: Logs_Intermediates/DnaDragenCaller/{}/{}.hard-filtered.gvcf.gz
+  localapp_ruo-2.2.0.12: Logs_Intermediates/VariantMatching/{}/{}_MergedSmallVariants.genome.vcf
+tmb_trace_tsv:
+  dragen_2.6.2.4: Logs_Intermediates/Tmb/{}/{}.tmb.trace.tsv
+  localapp_ruo-2.2.0.12: Logs_Intermediates/Tmb/{}/{}_TMB_Trace.tsv
+variants_annotated_json:
+  dragen_2.6.2.4: Logs_Intermediates/Annotation/{}/{}_DNAVariants_Annotated.json
+  localapp_ruo-2.2.0.12: Logs_Intermediates/Annotation/{}/{}_SmallVariants_Annotated.json.gz
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index d284de1..78cfce8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,7 @@ dependencies = [
     "cyvcf2>=0.32.1",
     "msgspec>=0.21.1",
     "polars>=1.39.3",
+    "pyyaml>=6.0.3",
     "typer>=0.24.1",
 ]
 dynamic = ["version"]
diff --git a/src/tsoppy/general/classes.py b/src/tsoppy/general/classes.py
new file mode 100644
index 0000000..d1f60bd
--- /dev/null
+++ b/src/tsoppy/general/classes.py
@@ -0,0 +1,229 @@
+import gzip
+import logging
+import os
+from pathlib import Path
+
+import cyvcf2
+import msgspec
+import polars
+
+from tsoppy.general.file_parser import Parse_section_tsv
+
+# Use logger that was set up in CLI
+logger = logging.getLogger(__name__)
+
+
+class WorkflowConfig(msgspec.Struct):
+    """Config class for workflow output file path format strings.
+
+    Every field maps a workflow id (see WorkflowOutput.workflow_id) to a path
+    format string that is joined onto the workflow output root.
+    """
+
+    metrics_output_tsv: dict[str, str]
+    small_variant_genome_vcf: dict[str, str]
+    tmb_trace_tsv: dict[str, str]
+    variants_annotated_json: dict[str, str]
+
+    def __eq__(self, other):
+        """Return True if other is a WorkflowConfig with equal mappings."""
+        if not isinstance(other, WorkflowConfig):
+            return False
+        if self.metrics_output_tsv != other.metrics_output_tsv:
+            return False
+        if self.small_variant_genome_vcf != other.small_variant_genome_vcf:
+            return False
+        if self.tmb_trace_tsv != other.tmb_trace_tsv:
+            return False
+        return self.variants_annotated_json == other.variants_annotated_json
+
+
+class WorkflowOutput:
+    """Base class for outputs produced by different workflows (e.g. dragen/localapp).
+
+    Attributes:
+        config: Configuration (WorkflowConfig)
+        root: Root path (Path)
+        workflow_type: Detected workflow type (str)
+        workflow_version: Detected workflow version (str)
+    """
+
+    def __init__(self, config_yaml: str | Path, root_path: str | Path):
+        """Initialize WorkflowOutput.
+
+        Args:
+            config_yaml: Path to the YAML file that is decoded into a WorkflowConfig
+            root_path: Root directory of the workflow output
+        """
+        self.root = Path(root_path)
+        with open(config_yaml, "r", encoding="utf-8") as yaml_file:
+            self.config = msgspec.yaml.decode(yaml_file.read(), type=WorkflowConfig)
+
+        self._detect_type_and_version()
+
+    def _detect_type_and_version(self):
+        """Detect which workflow type and version is present based on information in MetricsOutput.tsv."""
+
+        # Get all values for MetricsOutput.tsv paths and check if they are the same
+        info_src = list(self.config.metrics_output_tsv.values())
+        if len(set(info_src)) != 1:
+            raise ValueError(
+                f"Got {info_src} but need exactly one file to detect workflow id"
+            )
+
+        # Parse MetricsOutput.tsv
+        headers, sections = Parse_section_tsv(
+            os.path.join(self.root, info_src[0]), ["Header"]
+        )
+
+        # Check if DRAGEN is part of the header and assume the data is localapp if not
+        if "DRAGEN" in headers[0]:
+            self.workflow_type = "dragen"
+        else:
+            self.workflow_type = "localapp"
+
+        # Set workflow version from Header section
+        self.workflow_version = sections["Header"].item(
+            row=0, column="Workflow Version"
+        )
+
+    def _sample_file(self, formats: dict[str, str], sample_id: str, label: str) -> Path:
+        """Return the path of a per-sample output file after checking that it exists.
+
+        Args:
+            formats: Mapping of workflow id to path format string
+            sample_id: Sample identifier inserted into the format string
+            label: File description used in the error message
+
+        Raises:
+            KeyError: If formats has no entry for the detected workflow id
+            FileNotFoundError: If the resolved path is not a file
+        """
+        fmt = formats[self.workflow_id()]
+        path = Path(os.path.join(self.root, fmt.format(sample_id, sample_id)))
+        if not path.is_file():
+            message = f"{label} missing: File {path} does not exist."
+            # Log via the module logger (not the root logger) so the CLI setup applies
+            logger.error(message)
+            raise FileNotFoundError(message)
+        return path
+
+    def __eq__(self, other):
+        """Compare config, root, workflow type and workflow version with other."""
+        if not isinstance(other, WorkflowOutput):
+            return False
+        if self.config != other.config:
+            return False
+        if self.root != other.root:
+            return False
+        if self.workflow_type != other.workflow_type:
+            return False
+        return self.workflow_version == other.workflow_version
+
+    def workflow_id(self):
+        """Return combined string for workflow type and version."""
+        return f"{self.workflow_type}_{self.workflow_version}"
+
+
+class SmallVariantGenomeVcf(WorkflowOutput):
+    """Input class for small variant genome VCF files produced by different workflows.
+
+    Attributes:
+        path: Path to vcf (Path)
+        sample_id: Sample identifier (str)
+        vcf: Parsed VCF object (cyvcf2.VCF)
+    """
+
+    def __init__(self, config_yaml: str | Path, root_path: str | Path, sample_id: str):
+        """Initialize SmallVariantGenomeVcf."""
+        super().__init__(config_yaml, root_path)
+        self.sample_id = sample_id
+        self._parse()
+
+    @classmethod
+    def create(cls, workflow_output: WorkflowOutput, sample_id: str):
+        """Create SmallVariantGenomeVcf from existing WorkflowOutput."""
+        obj = cls.__new__(cls)
+        obj.__dict__.update(workflow_output.__dict__)
+        obj.sample_id = sample_id
+        obj._parse()
+        return obj
+
+    def _parse(self):
+        """Parse the small variant genome VCF file."""
+        self.path = self._sample_file(
+            self.config.small_variant_genome_vcf,
+            self.sample_id,
+            "Small variant genome VCF",
+        )
+        self.vcf = cyvcf2.VCF(self.path)
+
+
+class TmbTraceTsv(WorkflowOutput):
+    """Input class for TMB trace files produced by different workflows.
+
+    Attributes:
+        path: Path to the TMB trace tsv (Path)
+        sample_id: Sample identifier (str)
+        table: Parsed rows of the TMB trace file (polars.DataFrame)
+    """
+
+    def __init__(self, config_yaml: str | Path, root_path: str | Path, sample_id: str):
+        """Initialize TmbTraceTsv."""
+        super().__init__(config_yaml, root_path)
+        self.sample_id = sample_id
+        self._parse()
+
+    @classmethod
+    def create(cls, workflow_output: WorkflowOutput, sample_id: str):
+        """Create TmbTraceTsv from existing WorkflowOutput."""
+        obj = cls.__new__(cls)
+        obj.__dict__.update(workflow_output.__dict__)
+        obj.sample_id = sample_id
+        obj._parse()
+        return obj
+
+    def _parse(self):
+        """Parse the TMB trace tsv."""
+        self.path = self._sample_file(
+            self.config.tmb_trace_tsv, self.sample_id, "TMB trace TSV"
+        )
+        self.table = polars.read_csv(self.path, separator="\t")
+
+
+class VariantsAnnotatedJson(WorkflowOutput):
+    """Input class for annotated JSON files produced by different workflows.
+
+    Attributes:
+        path: Path to the annotated JSON file (Path)
+        data: Parsed JSON data (dict)
+        sample_id: Sample identifier (str)
+    """
+
+    def __init__(self, config_yaml: str | Path, root_path: str | Path, sample_id: str):
+        """Initialize VariantsAnnotatedJson."""
+        super().__init__(config_yaml, root_path)
+        self.sample_id = sample_id
+        self._parse()
+
+    @classmethod
+    def create(cls, workflow_output: WorkflowOutput, sample_id: str):
+        """Create VariantsAnnotatedJson from existing WorkflowOutput."""
+        obj = cls.__new__(cls)
+        obj.__dict__.update(workflow_output.__dict__)
+        obj.sample_id = sample_id
+        obj._parse()
+        return obj
+
+    def _parse(self):
+        """Parse the variants annotated JSON file."""
+        self.path = self._sample_file(
+            self.config.variants_annotated_json,
+            self.sample_id,
+            "Variants annotated JSON",
+        )
+        # The config can point at a gzip-compressed file (*.json.gz), which a plain
+        # open() cannot decode, so choose the opener based on the file suffix
+        opener = gzip.open if self.path.suffix == ".gz" else open
+        with opener(self.path, "rb") as file:
+            self.data = msgspec.json.decode(file.read())
diff --git a/tests/general_classes_test.py b/tests/general_classes_test.py
new file mode 100644
index 0000000..4bf9f84
--- /dev/null
+++ b/tests/general_classes_test.py
@@ -0,0 +1,46 @@
+from os import path
+from contextlib import nullcontext
+from pytest import mark, raises
+
+from tsoppy.general.classes import (
+    WorkflowOutput,
+    SmallVariantGenomeVcf
+)
+
+# Define path to test data - cannot be absolute due to different paths locally and in CI
+test_data_dir = "tests/test_data/general_classes"
+
+
+@mark.parametrize(
+    "inputs, exception, want",
+    [
+        (
+            (
+                'config.yaml',
+                path.join(test_data_dir, "dragen/standard")
+            ),
+            nullcontext(),
+            'dragen_2.6.2.4'
+        ),
+        (
+            (
+                'config.yaml',
+                path.join(test_data_dir, "localapp/standard")
+            ),
+            nullcontext(),
+            'localapp_ruo-2.2.0.12'
+        ),
+        (
+            (
+                'config.yaml',
+                path.join(test_data_dir, "localapp/non-existent")
+            ),
+            raises(FileNotFoundError),
+            ''
+        )
+    ]
+)
+def test_workflowoutput_init(inputs, exception, want):
+    with exception:
+        got = WorkflowOutput(inputs[0], inputs[1])
+        assert got.workflow_id() == want
diff --git a/tests/general_file_parser_test.py b/tests/general_file_parser_test.py
index 354115f..500d66e 100644
--- a/tests/general_file_parser_test.py
+++ b/tests/general_file_parser_test.py
@@ -12,7 +12,7 @@
 )
 
 # Define path to test data - cannot be absolute due to different paths locally and in CI
-test_data_dir = "tests/test_data/general"
+test_data_dir = "tests/test_data/general_file_parser"
 
 
 @mark.parametrize(
diff --git a/tests/test_data/general_classes/dragen/standard/Logs_Intermediates/MetricsOutput/MetricsOutput.tsv b/tests/test_data/general_classes/dragen/standard/Logs_Intermediates/MetricsOutput/MetricsOutput.tsv
new file mode 100644
index 0000000..13f0c81
--- /dev/null
+++ b/tests/test_data/general_classes/dragen/standard/Logs_Intermediates/MetricsOutput/MetricsOutput.tsv
@@ -0,0 +1,7 @@
+DRAGEN TruSight Oncology 500 v2.6.2 Analysis Software - Metrics Output
+For Research Use Only. Not for use in diagnostic procedures.
+
+[Header]
+Output Date	2020-01-01
+Output Time	01:00:00
+Workflow Version	2.6.2.4
\ No newline at end of file
diff --git a/tests/test_data/general_classes/localapp/standard/Logs_Intermediates/MetricsOutput/MetricsOutput.tsv b/tests/test_data/general_classes/localapp/standard/Logs_Intermediates/MetricsOutput/MetricsOutput.tsv
new file mode 100644
index 0000000..a36c937
--- /dev/null
+++ b/tests/test_data/general_classes/localapp/standard/Logs_Intermediates/MetricsOutput/MetricsOutput.tsv
@@ -0,0 +1,7 @@
+TruSight Oncology 500 - Metrics Output
+For Research Use Only. Not for use in diagnostic procedures.
+
+[Header]
+Output Date	2020-01-01
+Output Time	01:00:00
+Workflow Version	ruo-2.2.0.12
\ No newline at end of file
diff --git a/tests/test_data/general/parse_section_tsv/empty.tsv b/tests/test_data/general_file_parser/parse_section_tsv/empty.tsv
similarity index 100%
rename from tests/test_data/general/parse_section_tsv/empty.tsv
rename to tests/test_data/general_file_parser/parse_section_tsv/empty.tsv
diff --git a/tests/test_data/general/parse_section_tsv/empty_first_column_name.tsv b/tests/test_data/general_file_parser/parse_section_tsv/empty_first_column_name.tsv
similarity index 100%
rename from tests/test_data/general/parse_section_tsv/empty_first_column_name.tsv
rename to tests/test_data/general_file_parser/parse_section_tsv/empty_first_column_name.tsv
diff --git a/tests/test_data/general/parse_section_tsv/extra_empty_lines.tsv b/tests/test_data/general_file_parser/parse_section_tsv/extra_empty_lines.tsv
similarity index 100%
rename from tests/test_data/general/parse_section_tsv/extra_empty_lines.tsv
rename to tests/test_data/general_file_parser/parse_section_tsv/extra_empty_lines.tsv
diff --git a/tests/test_data/general/parse_section_tsv/key_value.tsv b/tests/test_data/general_file_parser/parse_section_tsv/key_value.tsv
similarity index 100%
rename from tests/test_data/general/parse_section_tsv/key_value.tsv
rename to tests/test_data/general_file_parser/parse_section_tsv/key_value.tsv
diff --git a/tests/test_data/general/parse_section_tsv/multiple_sections.tsv b/tests/test_data/general_file_parser/parse_section_tsv/multiple_sections.tsv
similarity index 100%
rename from tests/test_data/general/parse_section_tsv/multiple_sections.tsv
rename to tests/test_data/general_file_parser/parse_section_tsv/multiple_sections.tsv
diff --git a/tests/test_data/general/parse_section_tsv/no_headers.tsv b/tests/test_data/general_file_parser/parse_section_tsv/no_headers.tsv
similarity index 100%
rename from tests/test_data/general/parse_section_tsv/no_headers.tsv
rename to tests/test_data/general_file_parser/parse_section_tsv/no_headers.tsv
diff --git a/tests/test_data/general/parse_section_tsv/null_columns.tsv b/tests/test_data/general_file_parser/parse_section_tsv/null_columns.tsv
similarity index 100%
rename from tests/test_data/general/parse_section_tsv/null_columns.tsv
rename to tests/test_data/general_file_parser/parse_section_tsv/null_columns.tsv
diff --git a/tests/test_data/general/parse_section_tsv/standard.tsv b/tests/test_data/general_file_parser/parse_section_tsv/standard.tsv
similarity index 100%
rename from tests/test_data/general/parse_section_tsv/standard.tsv
rename to tests/test_data/general_file_parser/parse_section_tsv/standard.tsv