From 2011b984ad51e4727aea764526718e69bec6fa0a Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:18:15 +0200 Subject: [PATCH 01/20] chore: add pandas to deps --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1107b88..90e9a51 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools", "setuptools-scm", "setuptools-git-versioning"] build-backend = "setuptools.build_meta" [project] -dependencies = ["typer>=0.24.1"] +dependencies = ["pandas>=3.0.1", "typer>=0.24.1"] dynamic = ["version"] name = "tsoppy" requires-python = ">=3.14" From b178f0f1ffaec23cad7712d6a0e9beb56dee87f6 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:19:21 +0200 Subject: [PATCH 02/20] feat: add update small variant vcf list subpackage --- .../update_small_variant_vcf_list/__init__.py | 0 .../update_small_variant_vcf_list/main.py | 179 ++++++++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 src/tsoppy/update_small_variant_vcf_list/__init__.py create mode 100644 src/tsoppy/update_small_variant_vcf_list/main.py diff --git a/src/tsoppy/update_small_variant_vcf_list/__init__.py b/src/tsoppy/update_small_variant_vcf_list/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/tsoppy/update_small_variant_vcf_list/main.py b/src/tsoppy/update_small_variant_vcf_list/main.py new file mode 100644 index 0000000..b906a18 --- /dev/null +++ b/src/tsoppy/update_small_variant_vcf_list/main.py @@ -0,0 +1,179 @@ +""" +This module contains the code for the `update_small_variant_vcf_list` command. +The command takes two arguments, `results_dir`, which is a string that specifies the directory where the results of the latest TSO500 run are stored. 
+""" + +import glob +import logging +import re +from datetime import datetime +from pathlib import Path + +import pandas + +# Use logger that was set up in CLI +logger = logging.getLogger(__name__) + + +class VcfList: + """ + Represents small variant VCF list. + + Attributes: + dataframe (Dataframe): Dataframe representing the current version of small variant VCF list. + inpred_id_regex (str): Regular expression matching InPreD IDs. + output (str): Path to updated version of small variant VCF list. + tumor_sample_types (set[str]): Single letter codes representing a tumor sample. + vcf_list_columns (list[str]): List of dataframe column names. + vcfs (list[str]): Small variant VCF(s) located in TSO500 results directory. + """ + vcf_list_columns = ["vcf", "sample_type"] + + def __init__(self, results_dir: Path, glob_pattern: str, vcf_list: Path | None, inpred_id_regex: str, tumor_sample_types: str, output: str): + """ + Create new instance of SmallVariantVcfList. + """ + self.vcfs = glob.glob(f"{results_dir}/{glob_pattern}") + self.inpred_id_regex = rf"{inpred_id_regex}" + self.tumor_sample_types = set(tumor_sample_types.split(",")) + self.dataframe = pandas.DataFrame(columns=self.vcf_list_columns) + + # Try reading small variant VCF list or start from scratch + if vcf_list: + try: + self.dataframe = pandas.read_csv( + vcf_list, sep="\t", names=self.vcf_list_columns, on_bad_lines='warn') + except FileNotFoundError: + logger.warning( + f"{vcf_list} not found, creating new small variant VCF list.") + else: + logger.info( + f"no small variant VCF list specified, creating new one.") + + # Replace placeholder with actual date + if "" in output: + now = datetime.now() + self.output = output.replace("", now.strftime("%Y%m%d")) + else: + self.output = output + + def __eq__(self, other): + """ + Compare to other class instance. 
+ """ + if not isinstance(other, VcfList): + return NotImplemented + if self.dataframe != other.dataframe: + return False + if self.inpred_id_regex != other.inpred_id_regex: + return False + if self.output != other.output: + return False + if self.tumor_sample_types != other.tumor_sample_types: + return False + return self.vcfs != other.vcfs + + def update(self): + """ + Add VCF(s) from results directory to small variant VCF list. + """ + + # Loop over all small variant VCFs + for vcf in self.vcfs: + + # Avoid duplication + if vcf in self.dataframe["vcf"].values: + logger.warning( + f"{vcf} is already in small variant VCF list, skipping.") + continue + + # Parse InPreD ID to get patient ID and sample type + match = re.search(self.inpred_id_regex, vcf) + try: + patient_id = match.group("patient_id") + sample_type = match.group("sample_type") + except AttributeError: + logger.warning( + f"could not parse InPreD ID from {vcf}, skipping.") + continue + + # Check if VCF is eligible for small variant VCF list and add if yes + small_variant_vcf = Vcf( + vcf, patient_id, sample_type, self.tumor_sample_types) + if not small_variant_vcf.include: + continue + else: + self.dataframe.loc[len(self.dataframe) + ] = small_variant_vcf.row() + + # Check if new patient ID is represented multiple times + patient_sample_count = self.dataframe["vcf"].str.contains( + patient_id).sum() + if patient_sample_count > 1: + logger.warning( + f"patient {patient_id} has {patient_sample_count} vcf(s) in the small variant VCF list.") + + # Write updated small variant VCF list to file + self.dataframe.drop_duplicates().to_csv( + self.output, sep="\t", header=False, index=False) + + +class Vcf: + """ + Represents small variant VCF. + + Attributes: + include (bool): Whether to add the vcf to the small variant VCF file or not. + patient_id (str): ID of patient that the VCF belongs to. + sample_type (str): Single letter code representing type of sample, e.g. T = tumor. + vcf (str): Path to VCF file. 
+ """ + include = True + + def __init__(self, vcf: str, patient_id: str, sample_type: str, tumor_sample_types: set): + """ + Create new instance of SmallVariantVcf. + """ + self.vcf = vcf + self.patient_id = patient_id + + # Exclude control sample starting with IPC + if patient_id.startswith("IPC"): + logger.warning(f"{self.patient_id} is a control sample, skipping.") + self.include = False + return + + # Ensure sample type is N or included in tumor_sample_types + if sample_type != "T" and sample_type != "N": + if sample_type not in tumor_sample_types: + logger.warning( + f"{self.vcf} has sample type {sample_type} which is not {tumor_sample_types} or N, skipping.") + self.include = False + return + else: + # Reset any sample type in tumor_sample_types with T + logger.warning( + f"sample type code {sample_type} for {vcf} will be replaced with T") + self.sample_type = "T" + else: + self.sample_type = sample_type + + def __eq__(self, other): + """ + Compare to other class instance. + """ + if not isinstance(other, Vcf): + return NotImplemented + if self.include != other.include: + return False + if self.patient_id != other.patient_id: + return False + if self.sample_type != other.sample_type: + return False + return self.vcf != other.vcf + + def row(self): + """ + Return small variant VCF list row. 
+ """ + return [self.vcf, self.sample_type] From 74d454151ec698e3c7007665052ec1eb9f06450a Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:20:49 +0200 Subject: [PATCH 03/20] test: add tests and test data for update small variant vcf list subpackage --- tests/cli_test.py | 10 -- ...D01-N01-A01_MergedSmallVariants.genome.vcf | 0 .../TSO500_vcf_list_expected.tsv | 1 + ...D01-N01-A01_MergedSmallVariants.genome.vcf | 0 .../TSO500_vcf_list.tsv | 0 .../TSO500_vcf_list_expected.tsv | 0 ...D01-N01-A01_MergedSmallVariants.genome.vcf | 0 .../skip_existing_vcf/TSO500_vcf_list.tsv | 1 + .../TSO500_vcf_list_expected.tsv | 1 + ...D01-N01-A01_MergedSmallVariants.genome.vcf | 0 .../TSO500_vcf_list_expected.tsv | 1 + ...D01-N01-A01_MergedSmallVariants.genome.vcf | 0 .../TSO500_vcf_list.tsv | 0 .../TSO500_vcf_list_expected.tsv | 1 + ...test_update_small_variant_vcf_list_main.py | 135 ++++++++++++++++++ 15 files changed, 140 insertions(+), 10 deletions(-) delete mode 100644 tests/cli_test.py create mode 100644 tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf create mode 100644 tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/TSO500_vcf_list_expected.tsv create mode 100644 tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/LocalApp/Results/IPH00002-D01-N01-A01/IPH00002-D01-N01-A01_MergedSmallVariants.genome.vcf create mode 100644 tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list.tsv create mode 100644 tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list_expected.tsv create mode 100644 tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf create mode 100644 
tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list.tsv create mode 100644 tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list_expected.tsv create mode 100644 tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf create mode 100644 tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv create mode 100644 tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf create mode 100644 tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list.tsv create mode 100644 tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv create mode 100644 tests/test_update_small_variant_vcf_list_main.py diff --git a/tests/cli_test.py b/tests/cli_test.py deleted file mode 100644 index 7b8ee75..0000000 --- a/tests/cli_test.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -CLI module unit tests for tsoppy. -""" - - -def test_placeholder(): - """ - Unit test for the placeholder command in the CLI module. 
- """ - assert True diff --git a/tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf b/tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/TSO500_vcf_list_expected.tsv b/tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/TSO500_vcf_list_expected.tsv new file mode 100644 index 0000000..2de322d --- /dev/null +++ b/tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/TSO500_vcf_list_expected.tsv @@ -0,0 +1 @@ +tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N diff --git a/tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/LocalApp/Results/IPH00002-D01-N01-A01/IPH00002-D01-N01-A01_MergedSmallVariants.genome.vcf b/tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/LocalApp/Results/IPH00002-D01-N01-A01/IPH00002-D01-N01-A01_MergedSmallVariants.genome.vcf new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list.tsv b/tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list_expected.tsv b/tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list_expected.tsv new file mode 100644 index 0000000..e69de29 diff --git 
a/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf b/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list.tsv b/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list.tsv new file mode 100644 index 0000000..166d2a7 --- /dev/null +++ b/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list.tsv @@ -0,0 +1 @@ +tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N diff --git a/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list_expected.tsv b/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list_expected.tsv new file mode 100644 index 0000000..166d2a7 --- /dev/null +++ b/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list_expected.tsv @@ -0,0 +1 @@ +tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N diff --git a/tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf b/tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv 
b/tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv new file mode 100644 index 0000000..91a1e5c --- /dev/null +++ b/tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv @@ -0,0 +1 @@ +tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N diff --git a/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf b/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list.tsv b/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv b/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv new file mode 100644 index 0000000..682e578 --- /dev/null +++ b/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv @@ -0,0 +1 @@ +tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N diff --git a/tests/test_update_small_variant_vcf_list_main.py b/tests/test_update_small_variant_vcf_list_main.py new file mode 100644 index 
0000000..857f166 --- /dev/null +++ b/tests/test_update_small_variant_vcf_list_main.py @@ -0,0 +1,135 @@ +""" +update small variant vcf list subpackage main module unit tests. +""" +from os import path +import os +import filecmp +import unittest +from tsoppy.update_small_variant_vcf_list.main import VcfList, Vcf + +# Define path to test data - cannot be absolute due to different paths locally and in CI +test_data_dir = "tests/test_data/update_small_variant_vcf_list_main" + +# test constants +glob_pattern = "**/Results/**/*_MergedSmallVariants.genome.vcf" +tumor_sample_types = "C,D,d,L,M,P,p,R,r,T,X" +inpred_id_regex = "(?P\\D{3}\\d{4})-\\D\\d{2}-(?P\\D)\\d{2}-\\D\\d{2}.*.vcf$" + + +class TestVcfList(unittest.TestCase): + def test_update(self): + test_cases = [ + { + "name": "successfully update small variant vcf list", + "results_dir": path.join(test_data_dir, "successfully_update_small_variant_vcf_list"), + "glob_pattern": glob_pattern, + "vcf_list": path.join(test_data_dir, "successfully_update_small_variant_vcf_list/TSO500_vcf_list.tsv"), + "inpred_id_regex": inpred_id_regex, + "tumor_sample_types": tumor_sample_types, + "output": path.join(test_data_dir, "successfully_update_small_variant_vcf_list/TSO500_vcf_list_updated.tsv"), + "expected": path.join(test_data_dir, "successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv"), + }, + { + "name": "create new small variant vcf list", + "results_dir": path.join(test_data_dir, "create_new_small_variant_vcf_list"), + "glob_pattern": glob_pattern, + "vcf_list": None, + "inpred_id_regex": inpred_id_regex, + "tumor_sample_types": tumor_sample_types, + "output": path.join(test_data_dir, "create_new_small_variant_vcf_list/TSO500_vcf_list_updated.tsv"), + "expected": path.join(test_data_dir, "create_new_small_variant_vcf_list/TSO500_vcf_list_expected.tsv"), + }, + { + "name": "small variant vcf list does not exist", + "results_dir": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist"), + 
"glob_pattern": glob_pattern, + "vcf_list": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist/TSO500_vcf_list.tsv"), + "inpred_id_regex": inpred_id_regex, + "tumor_sample_types": tumor_sample_types, + "output": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist/TSO500_vcf_list_updated.tsv"), + "expected": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv"), + }, + { + "name": "skip existing vcf", + "results_dir": path.join(test_data_dir, "skip_existing_vcf"), + "glob_pattern": glob_pattern, + "vcf_list": path.join(test_data_dir, "skip_existing_vcf/TSO500_vcf_list.tsv"), + "inpred_id_regex": inpred_id_regex, + "tumor_sample_types": tumor_sample_types, + "output": path.join(test_data_dir, "skip_existing_vcf/TSO500_vcf_list_updated.tsv"), + "expected": path.join(test_data_dir, "skip_existing_vcf/TSO500_vcf_list_expected.tsv"), + }, + { + "name": "inpred id not parsable", + "results_dir": path.join(test_data_dir, "inpred_id_not_parsable"), + "glob_pattern": glob_pattern, + "vcf_list": path.join(test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list.tsv"), + "inpred_id_regex": inpred_id_regex, + "tumor_sample_types": tumor_sample_types, + "output": path.join(test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list_updated.tsv"), + "expected": path.join(test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list_expected.tsv"), + }, + ] + + for test_case in test_cases: + with self.subTest(msg=test_case["name"]): + got = VcfList( + test_case["results_dir"], test_case["glob_pattern"], test_case["vcf_list"], test_case["inpred_id_regex"], test_case["tumor_sample_types"], test_case["output"]) + got.update() + assert filecmp.cmp(test_case["output"], test_case["expected"]) + os.remove(test_case["output"]) + + +class TestVcf(unittest.TestCase): + def test_init(self): + test_cases = [ + { + "name": "include sample", + "vcf": "IPH0001-01-T01-01_MergedSmallVariants.genome.vcf", + "patient_id": "IPH0001", + 
"sample_type": "T", + "tumor_sample_types": tumor_sample_types, + "expected": True, + }, + { + "name": "sample is control", + "vcf": "IPC0001-01-T01-01_MergedSmallVariants.genome.vcf", + "patient_id": "IPC0001", + "sample_type": "T", + "tumor_sample_types": tumor_sample_types, + "expected": False, + }, + { + "name": "sample is neither tumor nor normal", + "vcf": "IPH0001-01-A01-01_MergedSmallVariants.genome.vcf", + "patient_id": "IPH0001", + "sample_type": "A", + "tumor_sample_types": tumor_sample_types, + "expected": False, + }, + ] + + for test_case in test_cases: + with self.subTest(msg=test_case["name"]): + got = Vcf( + test_case["vcf"], test_case["patient_id"], test_case["sample_type"], test_case["tumor_sample_types"]) + assert got.include == test_case["expected"] + + def test_row(self): + test_cases = [ + { + "name": "successfully return row", + "vcf": "IPH0001-01-T01-01_MergedSmallVariants.genome.vcf", + "patient_id": "IPH0001", + "sample_type": "T", + "tumor_sample_types": tumor_sample_types, + "expected": ["IPH0001-01-T01-01_MergedSmallVariants.genome.vcf", "T"], + }, + ] + + for test_case in test_cases: + with self.subTest(msg=test_case["name"]): + vcf = Vcf( + test_case["vcf"], test_case["patient_id"], test_case["sample_type"], test_case["tumor_sample_types"]) + got = vcf.row() + assert got == test_case["expected"] From a55e39cdfbeee29025b73271d8d4c5be052d9bf7 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:22:43 +0200 Subject: [PATCH 04/20] feat: add logger and set defaults --- src/tsoppy/cli.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/tsoppy/cli.py b/src/tsoppy/cli.py index 3c0030a..8b23d47 100644 --- a/src/tsoppy/cli.py +++ b/src/tsoppy/cli.py @@ -3,10 +3,17 @@ """ import importlib.metadata +import logging from typing import Annotated import typer +# Set up logging for the CLI. The logging level is set to INFO, and the log messages will include the timestamp, log level, and message. 
+logging.basicConfig(level=logging.INFO, + format='%(asctime)s %(levelname)s: %(message)s', + datefmt='%Y/%m/%d %H:%M:%S') +logger = logging.getLogger(__name__) + app = typer.Typer() app_version = importlib.metadata.version("tsoppy") From 141c519bcc0576065be9964a563adaecedcbc436 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:25:13 +0200 Subject: [PATCH 05/20] feat: expose update small variant vcf list subpackage via command in cli --- src/tsoppy/cli.py | 64 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 10 deletions(-) diff --git a/src/tsoppy/cli.py b/src/tsoppy/cli.py index 8b23d47..9ca08e8 100644 --- a/src/tsoppy/cli.py +++ b/src/tsoppy/cli.py @@ -4,10 +4,14 @@ import importlib.metadata import logging +import re +from pathlib import Path from typing import Annotated import typer +from tsoppy.update_small_variant_vcf_list.main import VcfList + # Set up logging for the CLI. The logging level is set to INFO, and the log messages will include the timestamp, log level, and message. logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s', @@ -26,17 +30,57 @@ def version(): print(f"tsoppy version {app_version}") +def glob_pattern_callback(value: str) -> str: + """ + Callback function checking that the glob pattern ends with '.vcf'. + """ + if not value.endswith(".vcf"): + raise typer.BadParameter("Glob pattern must end with '.vcf'.") + return value + + +def inpred_id_regex_callback(value: str) -> str: + """ + Callback function ensuring inpred_id_regex contains the required named capture groups. 
+ """ + if "" not in value: + raise typer.BadParameter( + "inpred_id_regex must contain a named group 'patient_id'.") + if "" not in value: + raise typer.BadParameter( + "inpred_id_regex must contain a named group 'sample_type'.") + return value + + +def tumor_sample_types_callback(value: str) -> str: + """ + Callback function to ensure tumor_sample_types is a comma-separated list of single letters. + """ + if not re.fullmatch(r"^([A-Za-z],)+[A-Za-z]$", value): + raise typer.BadParameter( + "tumor_sample_types must be comma-separated list of single letters.") + return value + + @app.command() -def placeholder( - user_name: Annotated[str, typer.Option("--name", "-n")], - user_id: Annotated[str, typer.Option("--id", "-i")], - verbose: Annotated[bool, typer.Option("--verbose", "-v")] = False, +def update_small_variant_vcf_list( + results_dir: Annotated[Path | None, typer.Option(help="Directory where the results of the latest TSO500 run are stored.")], + glob_pattern: Annotated[str, typer.Option( + help="Glob pattern to search for small variant VCF files in the results directory.", callback=glob_pattern_callback)] = "**/Results/**/*_MergedSmallVariants.genome.vcf", + inpred_id_regex: Annotated[str, typer.Option( + help="Regular expression to extract the inpred_id from the VCF file name.", callback=inpred_id_regex_callback)] = "(?P\D{3}\d{4})-\D\d{2}-(?P\D)\d{2}-\D\d{2}.*.vcf$", + output: Annotated[str, typer.Option( + help="Name of new small variant VCF list.")] = f"small_variant_vcf_list_.tsv", + tumor_sample_types: Annotated[str, typer.Option( + help="Comma-separated list of sample types that are considered tumor samples.")] = "C,D,d,L,M,P,p,R,r,T,X", + vcf_list: Annotated[Path | None, typer.Option( + help="Path to list of small variant VCF files.")] = None, ): """ - This is the helptext for the placeholder command that demonstrates how to - use Typer for CLI applications. + Updates the small variant VCF list based on VCF(s) in results directory. 
""" - if verbose: - print(f"{user_name} has the following id: {user_id}") - else: - print(f"{user_name}: {user_id}") + logger.info("Start updating small variant VCF list.") + small_variant_vcf_list = VcfList( + results_dir, glob_pattern, vcf_list, inpred_id_regex, tumor_sample_types, output) + small_variant_vcf_list.update() + logger.info("Finished updating small variant VCF list.") From 02fb98fed777ea76268209de79fa1ce33fc98ed1 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:25:37 +0200 Subject: [PATCH 06/20] style: include comments --- src/tsoppy/cli.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/tsoppy/cli.py b/src/tsoppy/cli.py index 9ca08e8..eb0b590 100644 --- a/src/tsoppy/cli.py +++ b/src/tsoppy/cli.py @@ -18,7 +18,10 @@ datefmt='%Y/%m/%d %H:%M:%S') logger = logging.getLogger(__name__) +# Create a Typer app for the CLI. The app will be used to define the commands and their arguments. app = typer.Typer() + +# app_version will be set from git tag. app_version = importlib.metadata.version("tsoppy") From 8f625af922e0f163d13a6ebea1c42d7cb435ff45 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:26:39 +0200 Subject: [PATCH 07/20] chore: lint packages --- tests/test_update_small_variant_vcf_list_main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_update_small_variant_vcf_list_main.py b/tests/test_update_small_variant_vcf_list_main.py index 857f166..c069cfe 100644 --- a/tests/test_update_small_variant_vcf_list_main.py +++ b/tests/test_update_small_variant_vcf_list_main.py @@ -1,11 +1,12 @@ """ update small variant vcf list subpackage main module unit tests. 
""" -from os import path -import os import filecmp +import os import unittest -from tsoppy.update_small_variant_vcf_list.main import VcfList, Vcf +from os import path + +from tsoppy.update_small_variant_vcf_list.main import Vcf, VcfList # Define path to test data - cannot be absolute due to different paths locally and in CI test_data_dir = "tests/test_data/update_small_variant_vcf_list_main" From 938c6e769f02f59ff581fef71b5b655ff6d3f133 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:33:53 +0200 Subject: [PATCH 08/20] chore: ruff lint --- src/tsoppy/cli.py | 63 +++++++---- .../update_small_variant_vcf_list/main.py | 53 +++++---- ...test_update_small_variant_vcf_list_main.py | 101 ++++++++++++++---- 3 files changed, 158 insertions(+), 59 deletions(-) diff --git a/src/tsoppy/cli.py b/src/tsoppy/cli.py index eb0b590..8dabc70 100644 --- a/src/tsoppy/cli.py +++ b/src/tsoppy/cli.py @@ -13,9 +13,11 @@ from tsoppy.update_small_variant_vcf_list.main import VcfList # Set up logging for the CLI. The logging level is set to INFO, and the log messages will include the timestamp, log level, and message. -logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s: %(message)s', - datefmt='%Y/%m/%d %H:%M:%S') +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s: %(message)s", + datefmt="%Y/%m/%d %H:%M:%S", +) logger = logging.getLogger(__name__) # Create a Typer app for the CLI. The app will be used to define the commands and their arguments. @@ -48,10 +50,12 @@ def inpred_id_regex_callback(value: str) -> str: """ if "" not in value: raise typer.BadParameter( - "inpred_id_regex must contain a named group 'patient_id'.") + "inpred_id_regex must contain a named group 'patient_id'." + ) if "" not in value: raise typer.BadParameter( - "inpred_id_regex must contain a named group 'sample_type'.") + "inpred_id_regex must contain a named group 'sample_type'." 
+ ) return value @@ -61,29 +65,52 @@ def tumor_sample_types_callback(value: str) -> str: """ if not re.fullmatch(r"^([A-Za-z],)+[A-Za-z]$", value): raise typer.BadParameter( - "tumor_sample_types must be comma-separated list of single letters.") + "tumor_sample_types must be comma-separated list of single letters." + ) return value @app.command() def update_small_variant_vcf_list( - results_dir: Annotated[Path | None, typer.Option(help="Directory where the results of the latest TSO500 run are stored.")], - glob_pattern: Annotated[str, typer.Option( - help="Glob pattern to search for small variant VCF files in the results directory.", callback=glob_pattern_callback)] = "**/Results/**/*_MergedSmallVariants.genome.vcf", - inpred_id_regex: Annotated[str, typer.Option( - help="Regular expression to extract the inpred_id from the VCF file name.", callback=inpred_id_regex_callback)] = "(?P\D{3}\d{4})-\D\d{2}-(?P\D)\d{2}-\D\d{2}.*.vcf$", - output: Annotated[str, typer.Option( - help="Name of new small variant VCF list.")] = f"small_variant_vcf_list_.tsv", - tumor_sample_types: Annotated[str, typer.Option( - help="Comma-separated list of sample types that are considered tumor samples.")] = "C,D,d,L,M,P,p,R,r,T,X", - vcf_list: Annotated[Path | None, typer.Option( - help="Path to list of small variant VCF files.")] = None, + results_dir: Annotated[ + Path | None, + typer.Option( + help="Directory where the results of the latest TSO500 run are stored." 
+ ), + ], + glob_pattern: Annotated[ + str, + typer.Option( + help="Glob pattern to search for small variant VCF files in the results directory.", + callback=glob_pattern_callback, + ), + ] = "**/Results/**/*_MergedSmallVariants.genome.vcf", + inpred_id_regex: Annotated[ + str, + typer.Option( + help="Regular expression to extract the inpred_id from the VCF file name.", + callback=inpred_id_regex_callback, + ), + ] = "(?P\D{3}\d{4})-\D\d{2}-(?P\D)\d{2}-\D\d{2}.*.vcf$", + output: Annotated[ + str, typer.Option(help="Name of new small variant VCF list.") + ] = "small_variant_vcf_list_.tsv", + tumor_sample_types: Annotated[ + str, + typer.Option( + help="Comma-separated list of sample types that are considered tumor samples." + ), + ] = "C,D,d,L,M,P,p,R,r,T,X", + vcf_list: Annotated[ + Path | None, typer.Option(help="Path to list of small variant VCF files.") + ] = None, ): """ Updates the small variant VCF list based on VCF(s) in results directory. """ logger.info("Start updating small variant VCF list.") small_variant_vcf_list = VcfList( - results_dir, glob_pattern, vcf_list, inpred_id_regex, tumor_sample_types, output) + results_dir, glob_pattern, vcf_list, inpred_id_regex, tumor_sample_types, output + ) small_variant_vcf_list.update() logger.info("Finished updating small variant VCF list.") diff --git a/src/tsoppy/update_small_variant_vcf_list/main.py b/src/tsoppy/update_small_variant_vcf_list/main.py index b906a18..d5428e5 100644 --- a/src/tsoppy/update_small_variant_vcf_list/main.py +++ b/src/tsoppy/update_small_variant_vcf_list/main.py @@ -27,9 +27,18 @@ class VcfList: vcf_list_columns (list[str]): List of dataframe column names. vcfs (list[str]): Small variant VCF(s) located in TSO500 results directory. 
""" + vcf_list_columns = ["vcf", "sample_type"] - def __init__(self, results_dir: Path, glob_pattern: str, vcf_list: Path | None, inpred_id_regex: str, tumor_sample_types: str, output: str): + def __init__( + self, + results_dir: Path, + glob_pattern: str, + vcf_list: Path | None, + inpred_id_regex: str, + tumor_sample_types: str, + output: str, + ): """ Create new instance of SmallVariantVcfList. """ @@ -42,13 +51,14 @@ def __init__(self, results_dir: Path, glob_pattern: str, vcf_list: Path | None, if vcf_list: try: self.dataframe = pandas.read_csv( - vcf_list, sep="\t", names=self.vcf_list_columns, on_bad_lines='warn') + vcf_list, sep="\t", names=self.vcf_list_columns, on_bad_lines="warn" + ) except FileNotFoundError: logger.warning( - f"{vcf_list} not found, creating new small variant VCF list.") + f"{vcf_list} not found, creating new small variant VCF list." + ) else: - logger.info( - f"no small variant VCF list specified, creating new one.") + logger.info("no small variant VCF list specified, creating new one.") # Replace placeholder with actual date if "" in output: @@ -80,11 +90,9 @@ def update(self): # Loop over all small variant VCFs for vcf in self.vcfs: - # Avoid duplication if vcf in self.dataframe["vcf"].values: - logger.warning( - f"{vcf} is already in small variant VCF list, skipping.") + logger.warning(f"{vcf} is already in small variant VCF list, skipping.") continue # Parse InPreD ID to get patient ID and sample type @@ -93,29 +101,29 @@ def update(self): patient_id = match.group("patient_id") sample_type = match.group("sample_type") except AttributeError: - logger.warning( - f"could not parse InPreD ID from {vcf}, skipping.") + logger.warning(f"could not parse InPreD ID from {vcf}, skipping.") continue # Check if VCF is eligible for small variant VCF list and add if yes small_variant_vcf = Vcf( - vcf, patient_id, sample_type, self.tumor_sample_types) + vcf, patient_id, sample_type, self.tumor_sample_types + ) if not small_variant_vcf.include: 
continue else: - self.dataframe.loc[len(self.dataframe) - ] = small_variant_vcf.row() + self.dataframe.loc[len(self.dataframe)] = small_variant_vcf.row() # Check if new patient ID is represented multiple times - patient_sample_count = self.dataframe["vcf"].str.contains( - patient_id).sum() + patient_sample_count = self.dataframe["vcf"].str.contains(patient_id).sum() if patient_sample_count > 1: logger.warning( - f"patient {patient_id} has {patient_sample_count} vcf(s) in the small variant VCF list.") + f"patient {patient_id} has {patient_sample_count} vcf(s) in the small variant VCF list." + ) # Write updated small variant VCF list to file self.dataframe.drop_duplicates().to_csv( - self.output, sep="\t", header=False, index=False) + self.output, sep="\t", header=False, index=False + ) class Vcf: @@ -128,9 +136,12 @@ class Vcf: sample_type (str): Single letter code representing type of sample, e.g. T = tumor. vcf (str): Path to VCF file. """ + include = True - def __init__(self, vcf: str, patient_id: str, sample_type: str, tumor_sample_types: set): + def __init__( + self, vcf: str, patient_id: str, sample_type: str, tumor_sample_types: set + ): """ Create new instance of SmallVariantVcf. """ @@ -147,13 +158,15 @@ def __init__(self, vcf: str, patient_id: str, sample_type: str, tumor_sample_typ if sample_type != "T" and sample_type != "N": if sample_type not in tumor_sample_types: logger.warning( - f"{self.vcf} has sample type {sample_type} which is not {tumor_sample_types} or N, skipping.") + f"{self.vcf} has sample type {sample_type} which is not {tumor_sample_types} or N, skipping." 
+ ) self.include = False return else: # Reset any sample type in tumor_sample_types with T logger.warning( - f"sample type code {sample_type} for {vcf} will be replaced with T") + f"sample type code {sample_type} for {vcf} will be replaced with T" + ) self.sample_type = "T" else: self.sample_type = sample_type diff --git a/tests/test_update_small_variant_vcf_list_main.py b/tests/test_update_small_variant_vcf_list_main.py index c069cfe..dde4b56 100644 --- a/tests/test_update_small_variant_vcf_list_main.py +++ b/tests/test_update_small_variant_vcf_list_main.py @@ -1,6 +1,7 @@ """ update small variant vcf list subpackage main module unit tests. """ + import filecmp import os import unittest @@ -14,7 +15,9 @@ # test constants glob_pattern = "**/Results/**/*_MergedSmallVariants.genome.vcf" tumor_sample_types = "C,D,d,L,M,P,p,R,r,T,X" -inpred_id_regex = "(?P\\D{3}\\d{4})-\\D\\d{2}-(?P\\D)\\d{2}-\\D\\d{2}.*.vcf$" +inpred_id_regex = ( + "(?P\\D{3}\\d{4})-\\D\\d{2}-(?P\\D)\\d{2}-\\D\\d{2}.*.vcf$" +) class TestVcfList(unittest.TestCase): @@ -22,60 +25,108 @@ def test_update(self): test_cases = [ { "name": "successfully update small variant vcf list", - "results_dir": path.join(test_data_dir, "successfully_update_small_variant_vcf_list"), + "results_dir": path.join( + test_data_dir, "successfully_update_small_variant_vcf_list" + ), "glob_pattern": glob_pattern, - "vcf_list": path.join(test_data_dir, "successfully_update_small_variant_vcf_list/TSO500_vcf_list.tsv"), + "vcf_list": path.join( + test_data_dir, + "successfully_update_small_variant_vcf_list/TSO500_vcf_list.tsv", + ), "inpred_id_regex": inpred_id_regex, "tumor_sample_types": tumor_sample_types, - "output": path.join(test_data_dir, "successfully_update_small_variant_vcf_list/TSO500_vcf_list_updated.tsv"), - "expected": path.join(test_data_dir, "successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv"), + "output": path.join( + test_data_dir, + 
"successfully_update_small_variant_vcf_list/TSO500_vcf_list_updated.tsv", + ), + "expected": path.join( + test_data_dir, + "successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv", + ), }, { "name": "create new small variant vcf list", - "results_dir": path.join(test_data_dir, "create_new_small_variant_vcf_list"), + "results_dir": path.join( + test_data_dir, "create_new_small_variant_vcf_list" + ), "glob_pattern": glob_pattern, "vcf_list": None, "inpred_id_regex": inpred_id_regex, "tumor_sample_types": tumor_sample_types, - "output": path.join(test_data_dir, "create_new_small_variant_vcf_list/TSO500_vcf_list_updated.tsv"), - "expected": path.join(test_data_dir, "create_new_small_variant_vcf_list/TSO500_vcf_list_expected.tsv"), + "output": path.join( + test_data_dir, + "create_new_small_variant_vcf_list/TSO500_vcf_list_updated.tsv", + ), + "expected": path.join( + test_data_dir, + "create_new_small_variant_vcf_list/TSO500_vcf_list_expected.tsv", + ), }, { "name": "small variant vcf list does not exist", - "results_dir": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist"), + "results_dir": path.join( + test_data_dir, "small_variant_vcf_list_does_not_exist" + ), "glob_pattern": glob_pattern, - "vcf_list": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist/TSO500_vcf_list.tsv"), + "vcf_list": path.join( + test_data_dir, + "small_variant_vcf_list_does_not_exist/TSO500_vcf_list.tsv", + ), "inpred_id_regex": inpred_id_regex, "tumor_sample_types": tumor_sample_types, - "output": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist/TSO500_vcf_list_updated.tsv"), - "expected": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv"), + "output": path.join( + test_data_dir, + "small_variant_vcf_list_does_not_exist/TSO500_vcf_list_updated.tsv", + ), + "expected": path.join( + test_data_dir, + "small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv", + ), }, { "name": "skip 
existing vcf", "results_dir": path.join(test_data_dir, "skip_existing_vcf"), "glob_pattern": glob_pattern, - "vcf_list": path.join(test_data_dir, "skip_existing_vcf/TSO500_vcf_list.tsv"), + "vcf_list": path.join( + test_data_dir, "skip_existing_vcf/TSO500_vcf_list.tsv" + ), "inpred_id_regex": inpred_id_regex, "tumor_sample_types": tumor_sample_types, - "output": path.join(test_data_dir, "skip_existing_vcf/TSO500_vcf_list_updated.tsv"), - "expected": path.join(test_data_dir, "skip_existing_vcf/TSO500_vcf_list_expected.tsv"), + "output": path.join( + test_data_dir, "skip_existing_vcf/TSO500_vcf_list_updated.tsv" + ), + "expected": path.join( + test_data_dir, "skip_existing_vcf/TSO500_vcf_list_expected.tsv" + ), }, { "name": "inpred id not parsable", "results_dir": path.join(test_data_dir, "inpred_id_not_parsable"), "glob_pattern": glob_pattern, - "vcf_list": path.join(test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list.tsv"), + "vcf_list": path.join( + test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list.tsv" + ), "inpred_id_regex": inpred_id_regex, "tumor_sample_types": tumor_sample_types, - "output": path.join(test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list_updated.tsv"), - "expected": path.join(test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list_expected.tsv"), + "output": path.join( + test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list_updated.tsv" + ), + "expected": path.join( + test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list_expected.tsv" + ), }, ] for test_case in test_cases: with self.subTest(msg=test_case["name"]): got = VcfList( - test_case["results_dir"], test_case["glob_pattern"], test_case["vcf_list"], test_case["inpred_id_regex"], test_case["tumor_sample_types"], test_case["output"]) + test_case["results_dir"], + test_case["glob_pattern"], + test_case["vcf_list"], + test_case["inpred_id_regex"], + test_case["tumor_sample_types"], + test_case["output"], + ) got.update() assert filecmp.cmp(test_case["output"], 
test_case["expected"]) os.remove(test_case["output"]) @@ -113,7 +164,11 @@ def test_init(self): for test_case in test_cases: with self.subTest(msg=test_case["name"]): got = Vcf( - test_case["vcf"], test_case["patient_id"], test_case["sample_type"], test_case["tumor_sample_types"]) + test_case["vcf"], + test_case["patient_id"], + test_case["sample_type"], + test_case["tumor_sample_types"], + ) assert got.include == test_case["expected"] def test_row(self): @@ -131,6 +186,10 @@ def test_row(self): for test_case in test_cases: with self.subTest(msg=test_case["name"]): vcf = Vcf( - test_case["vcf"], test_case["patient_id"], test_case["sample_type"], test_case["tumor_sample_types"]) + test_case["vcf"], + test_case["patient_id"], + test_case["sample_type"], + test_case["tumor_sample_types"], + ) got = vcf.row() assert got == test_case["expected"] From 35a377429bf45ecd71b65f62aa540af9271bf21b Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Thu, 9 Apr 2026 09:22:56 +0200 Subject: [PATCH 09/20] style: make clear that N stands for normal --- src/tsoppy/update_small_variant_vcf_list/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tsoppy/update_small_variant_vcf_list/main.py b/src/tsoppy/update_small_variant_vcf_list/main.py index d5428e5..e8453a5 100644 --- a/src/tsoppy/update_small_variant_vcf_list/main.py +++ b/src/tsoppy/update_small_variant_vcf_list/main.py @@ -154,11 +154,11 @@ def __init__( self.include = False return - # Ensure sample type is N or included in tumor_sample_types + # Ensure sample type is N(ormal) or included in tumor_sample_types if sample_type != "T" and sample_type != "N": if sample_type not in tumor_sample_types: logger.warning( - f"{self.vcf} has sample type {sample_type} which is not {tumor_sample_types} or N, skipping." + f"{self.vcf} has sample type {sample_type} which is not {tumor_sample_types} or N(ormal), skipping." 
) self.include = False return From d0ba93b8e75e18968159fb7f7beb224276c44136 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Thu, 9 Apr 2026 14:33:10 +0200 Subject: [PATCH 10/20] feat: move patient_id and sample_type parsing into Vcf class --- .../update_small_variant_vcf_list/main.py | 99 ++++++++++--------- 1 file changed, 55 insertions(+), 44 deletions(-) diff --git a/src/tsoppy/update_small_variant_vcf_list/main.py b/src/tsoppy/update_small_variant_vcf_list/main.py index e8453a5..ff00144 100644 --- a/src/tsoppy/update_small_variant_vcf_list/main.py +++ b/src/tsoppy/update_small_variant_vcf_list/main.py @@ -15,6 +15,19 @@ logger = logging.getLogger(__name__) +class InvalidSampleType(Exception): + """ + Exception if sample type is not valid. + """ + + def __init__(self, msg="sample type is not valid"): + self.msg = msg + super().__init__(self.msg) + + def __str__(self): + return self.msg + + class VcfList: """ Represents small variant VCF list. @@ -90,34 +103,44 @@ def update(self): # Loop over all small variant VCFs for vcf in self.vcfs: + # Try to create vcf class instance + try: + small_variant_vcf = Vcf( + vcf, self.inpred_id_regex, self.tumor_sample_types + ) + except AttributeError: + logger.warning( + f"could not parse InPreD ID from {small_variant_vcf.vcf}, skipping." + ) + continue + except InvalidSampleType: + logger.warning( + f"{small_variant_vcf.vcf} has sample type {small_variant_vcf.sample_type} which is not {self.tumor_sample_types} or N(ormal), skipping." 
+ ) + continue + # Avoid duplication - if vcf in self.dataframe["vcf"].values: + if small_variant_vcf.vcf in self.dataframe["vcf"].values: logger.warning(f"{vcf} is already in small variant VCF list, skipping.") continue - # Parse InPreD ID to get patient ID and sample type - match = re.search(self.inpred_id_regex, vcf) - try: - patient_id = match.group("patient_id") - sample_type = match.group("sample_type") - except AttributeError: - logger.warning(f"could not parse InPreD ID from {vcf}, skipping.") + # Exclude control samples + if small_variant_vcf.patient_id.startswith("IPC"): + logger.warning( + f"{small_variant_vcf.patient_id} is a control sample, skipping." + ) continue - # Check if VCF is eligible for small variant VCF list and add if yes - small_variant_vcf = Vcf( - vcf, patient_id, sample_type, self.tumor_sample_types - ) - if not small_variant_vcf.include: - continue - else: - self.dataframe.loc[len(self.dataframe)] = small_variant_vcf.row() + # Add vcf to list + self.dataframe.loc[len(self.dataframe)] = small_variant_vcf.row() # Check if new patient ID is represented multiple times - patient_sample_count = self.dataframe["vcf"].str.contains(patient_id).sum() + patient_sample_count = ( + self.dataframe["vcf"].str.contains(small_variant_vcf.patient_id).sum() + ) if patient_sample_count > 1: logger.warning( - f"patient {patient_id} has {patient_sample_count} vcf(s) in the small variant VCF list." + f"patient {small_variant_vcf.patient_id} has {patient_sample_count} vcf(s) in the small variant VCF list." ) # Write updated small variant VCF list to file @@ -131,45 +154,35 @@ class Vcf: Represents small variant VCF. Attributes: - include (bool): Whether to add the vcf to the small variant VCF file or not. patient_id (str): ID of patient that the VCF belongs to. sample_type (str): Single letter code representing type of sample, e.g. T = tumor. vcf (str): Path to VCF file. 
""" - include = True - - def __init__( - self, vcf: str, patient_id: str, sample_type: str, tumor_sample_types: set - ): + def __init__(self, vcf: str, inpred_id_regex: str, tumor_sample_types: set): """ Create new instance of SmallVariantVcf. """ self.vcf = vcf - self.patient_id = patient_id - - # Exclude control sample starting with IPC - if patient_id.startswith("IPC"): - logger.warning(f"{self.patient_id} is a control sample, skipping.") - self.include = False - return - # Ensure sample type is N(ormal) or included in tumor_sample_types - if sample_type != "T" and sample_type != "N": - if sample_type not in tumor_sample_types: - logger.warning( - f"{self.vcf} has sample type {sample_type} which is not {tumor_sample_types} or N(ormal), skipping." - ) - self.include = False - return + # Parse InPreD ID to get patient ID and sample type + match = re.search(inpred_id_regex, self.vcf) + try: + self.patient_id = match.group("patient_id") + self.sample_type = match.group("sample_type") + except AttributeError: + raise AttributeError + + # Validate sample type is N(ormal) or included in tumor_sample_types + if self.sample_type != "N": + if self.sample_type not in tumor_sample_types: + raise InvalidSampleType else: # Reset any sample type in tumor_sample_types with T logger.warning( - f"sample type code {sample_type} for {vcf} will be replaced with T" + f"sample type code {self.sample_type} for {self.vcf} will be replaced with T" ) self.sample_type = "T" - else: - self.sample_type = sample_type def __eq__(self, other): """ @@ -177,8 +190,6 @@ def __eq__(self, other): """ if not isinstance(other, Vcf): return NotImplemented - if self.include != other.include: - return False if self.patient_id != other.patient_id: return False if self.sample_type != other.sample_type: From 6cdec52f2beaf53628e42470a886e25a43f55c6a Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Thu, 9 Apr 2026 14:33:27 +0200 Subject: [PATCH 11/20] docs: update module description --- 
src/tsoppy/update_small_variant_vcf_list/main.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/tsoppy/update_small_variant_vcf_list/main.py b/src/tsoppy/update_small_variant_vcf_list/main.py index ff00144..1cf9151 100644 --- a/src/tsoppy/update_small_variant_vcf_list/main.py +++ b/src/tsoppy/update_small_variant_vcf_list/main.py @@ -1,6 +1,10 @@ """ -This module contains the code for the `update_small_variant_vcf_list` command. -The command takes two arguments, `results_dir`, which is a string that specifies the directory where the results of the latest TSO500 run are stored. +This module defines the classes 'VcfList' and 'Vcf'. +'VcfList' takes a directory holding TSO500 results, a glob to identify small variant vcf files, the current small variant vcf list, +a regular expression matching InPreD IDs, a set of tumor sample types and the path to the new small variant vcf list. +'VcfList' has a method to update the current small variant vcf list with vcfs found in the TSO500 results directory. +'Vcf' is defined by a path to a small variant vcf and a set of tumor sample types. +'Vcf' provides a method to create a new row for a pandas dataframe.
""" import glob From ae145864cc40991078f1f51338e0798fd9d0e172 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Thu, 9 Apr 2026 14:34:05 +0200 Subject: [PATCH 12/20] test: update unit tests to reflect new Vcf class definition --- pyproject.toml | 2 +- ...01-N01-A01_MergedSmallVariants.genome.vcf} | 0 .../TSO500_vcf_list.tsv | 0 .../TSO500_vcf_list_expected.tsv | 0 ...test_update_small_variant_vcf_list_main.py | 64 +++++++++++-------- 5 files changed, 37 insertions(+), 29 deletions(-) rename tests/test_data/update_small_variant_vcf_list_main/{inpred_id_not_parsable/LocalApp/Results/IPH00002-D01-N01-A01/IPH00002-D01-N01-A01_MergedSmallVariants.genome.vcf => sample_is_control/LocalApp/Results/IPC0002-D01-N01-A01/IPC0002-D01-N01-A01_MergedSmallVariants.genome.vcf} (100%) rename tests/test_data/update_small_variant_vcf_list_main/{inpred_id_not_parsable => sample_is_control}/TSO500_vcf_list.tsv (100%) rename tests/test_data/update_small_variant_vcf_list_main/{inpred_id_not_parsable => sample_is_control}/TSO500_vcf_list_expected.tsv (100%) diff --git a/pyproject.toml b/pyproject.toml index 90e9a51..84ebc27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ requires-python = ">=3.14" tsoppy = "tsoppy.cli:app" [project.optional-dependencies] -dev = ["isort==8.0.1", "ruff==0.15.6"] +dev = ["isort==8.0.1", "pytest==9.0.2", "ruff==0.15.6"] lint = ["ruff==0.15.6"] test = ["pytest==9.0.2", "pytest-emoji==0.2.0", "pytest-md==0.2.0"] diff --git a/tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/LocalApp/Results/IPH00002-D01-N01-A01/IPH00002-D01-N01-A01_MergedSmallVariants.genome.vcf b/tests/test_data/update_small_variant_vcf_list_main/sample_is_control/LocalApp/Results/IPC0002-D01-N01-A01/IPC0002-D01-N01-A01_MergedSmallVariants.genome.vcf similarity index 100% rename from tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/LocalApp/Results/IPH00002-D01-N01-A01/IPH00002-D01-N01-A01_MergedSmallVariants.genome.vcf rename 
to tests/test_data/update_small_variant_vcf_list_main/sample_is_control/LocalApp/Results/IPC0002-D01-N01-A01/IPC0002-D01-N01-A01_MergedSmallVariants.genome.vcf diff --git a/tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list.tsv b/tests/test_data/update_small_variant_vcf_list_main/sample_is_control/TSO500_vcf_list.tsv similarity index 100% rename from tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list.tsv rename to tests/test_data/update_small_variant_vcf_list_main/sample_is_control/TSO500_vcf_list.tsv diff --git a/tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list_expected.tsv b/tests/test_data/update_small_variant_vcf_list_main/sample_is_control/TSO500_vcf_list_expected.tsv similarity index 100% rename from tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list_expected.tsv rename to tests/test_data/update_small_variant_vcf_list_main/sample_is_control/TSO500_vcf_list_expected.tsv diff --git a/tests/test_update_small_variant_vcf_list_main.py b/tests/test_update_small_variant_vcf_list_main.py index dde4b56..73da1e0 100644 --- a/tests/test_update_small_variant_vcf_list_main.py +++ b/tests/test_update_small_variant_vcf_list_main.py @@ -5,9 +5,13 @@ import filecmp import os import unittest +from contextlib import nullcontext from os import path -from tsoppy.update_small_variant_vcf_list.main import Vcf, VcfList +import pytest + +from tsoppy.update_small_variant_vcf_list.main import (InvalidSampleType, Vcf, + VcfList) # Define path to test data - cannot be absolute due to different paths locally and in CI test_data_dir = "tests/test_data/update_small_variant_vcf_list_main" @@ -100,19 +104,19 @@ def test_update(self): ), }, { - "name": "inpred id not parsable", - "results_dir": path.join(test_data_dir, "inpred_id_not_parsable"), + "name": "sample is control", + "results_dir": path.join(test_data_dir, "sample_is_control"), 
"glob_pattern": glob_pattern, "vcf_list": path.join( - test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list.tsv" + test_data_dir, "sample_is_control/TSO500_vcf_list.tsv" ), "inpred_id_regex": inpred_id_regex, "tumor_sample_types": tumor_sample_types, "output": path.join( - test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list_updated.tsv" + test_data_dir, "sample_is_control/TSO500_vcf_list_updated.tsv" ), "expected": path.join( - test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list_expected.tsv" + test_data_dir, "sample_is_control/TSO500_vcf_list_expected.tsv" ), }, ] @@ -137,49 +141,54 @@ def test_init(self): test_cases = [ { "name": "include sample", - "vcf": "IPH0001-01-T01-01_MergedSmallVariants.genome.vcf", + "vcf": "IPH0001-D01-T01-A01_MergedSmallVariants.genome.vcf", + "inpred_id_regex": inpred_id_regex, + "tumor_sample_types": tumor_sample_types, + "exception": nullcontext(), "patient_id": "IPH0001", "sample_type": "T", - "tumor_sample_types": tumor_sample_types, - "expected": True, }, { - "name": "sample is control", - "vcf": "IPC0001-01-T01-01_MergedSmallVariants.genome.vcf", - "patient_id": "IPC0001", - "sample_type": "T", + "name": "inpred id is not parsable", + "vcf": "IPH0001D01-T01-A01_MergedSmallVariants.genome.vcf", + "inpred_id_regex": inpred_id_regex, "tumor_sample_types": tumor_sample_types, - "expected": False, + "exception": pytest.raises(AttributeError), + "patient_id": None, + "sample_type": None, }, { "name": "sample is neither tumor nor normal", - "vcf": "IPH0001-01-A01-01_MergedSmallVariants.genome.vcf", + "vcf": "IPH0001-D01-A01-A01_MergedSmallVariants.genome.vcf", + "inpred_id_regex": inpred_id_regex, + "tumor_sample_types": tumor_sample_types, + "exception": pytest.raises(InvalidSampleType), "patient_id": "IPH0001", "sample_type": "A", - "tumor_sample_types": tumor_sample_types, - "expected": False, }, ] for test_case in test_cases: with self.subTest(msg=test_case["name"]): - got = Vcf( - test_case["vcf"], - 
test_case["patient_id"], - test_case["sample_type"], - test_case["tumor_sample_types"], - ) - assert got.include == test_case["expected"] + with test_case["exception"]: + got = Vcf( + test_case["vcf"], + test_case["inpred_id_regex"], + test_case["tumor_sample_types"], + ) + assert got.patient_id == test_case["patient_id"] + assert got.sample_type == test_case["sample_type"] def test_row(self): test_cases = [ { "name": "successfully return row", - "vcf": "IPH0001-01-T01-01_MergedSmallVariants.genome.vcf", + "vcf": "IPH0001-D01-T01-A01_MergedSmallVariants.genome.vcf", + "inpred_id_regex": inpred_id_regex, "patient_id": "IPH0001", "sample_type": "T", "tumor_sample_types": tumor_sample_types, - "expected": ["IPH0001-01-T01-01_MergedSmallVariants.genome.vcf", "T"], + "expected": ["IPH0001-D01-T01-A01_MergedSmallVariants.genome.vcf", "T"], }, ] @@ -187,8 +196,7 @@ def test_row(self): with self.subTest(msg=test_case["name"]): vcf = Vcf( test_case["vcf"], - test_case["patient_id"], - test_case["sample_type"], + test_case["inpred_id_regex"], test_case["tumor_sample_types"], ) got = vcf.row() From 4f245c5018c92e9ec2de46520e870877b4b4a997 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Thu, 9 Apr 2026 14:37:24 +0200 Subject: [PATCH 13/20] chore: lint --- tests/test_update_small_variant_vcf_list_main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_update_small_variant_vcf_list_main.py b/tests/test_update_small_variant_vcf_list_main.py index 73da1e0..9cdc38c 100644 --- a/tests/test_update_small_variant_vcf_list_main.py +++ b/tests/test_update_small_variant_vcf_list_main.py @@ -10,8 +10,7 @@ import pytest -from tsoppy.update_small_variant_vcf_list.main import (InvalidSampleType, Vcf, - VcfList) +from tsoppy.update_small_variant_vcf_list.main import InvalidSampleType, Vcf, VcfList # Define path to test data - cannot be absolute due to different paths locally and in CI test_data_dir = "tests/test_data/update_small_variant_vcf_list_main" From 
954286bb499af88d7f75b9463830d27ea957442e Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Thu, 9 Apr 2026 15:48:14 +0200 Subject: [PATCH 14/20] feat: switch from pandas to polars to improve performance --- pyproject.toml | 2 +- .../update_small_variant_vcf_list/main.py | 31 +++++++++++-------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 84ebc27..69dc879 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools", "setuptools-scm", "setuptools-git-versioning"] build-backend = "setuptools.build_meta" [project] -dependencies = ["pandas>=3.0.1", "typer>=0.24.1"] +dependencies = ["polars>=1.39.3", "typer>=0.24.1"] dynamic = ["version"] name = "tsoppy" requires-python = ">=3.14" diff --git a/src/tsoppy/update_small_variant_vcf_list/main.py b/src/tsoppy/update_small_variant_vcf_list/main.py index 1cf9151..6065c2d 100644 --- a/src/tsoppy/update_small_variant_vcf_list/main.py +++ b/src/tsoppy/update_small_variant_vcf_list/main.py @@ -13,7 +13,7 @@ from datetime import datetime from pathlib import Path -import pandas +import polars # Use logger that was set up in CLI logger = logging.getLogger(__name__) @@ -42,10 +42,10 @@ class VcfList: output (str): Path to updated version of small variant VCF list. tumor_sample_types (set[str]): Single letter codes representing a tumor sample. vcf_list_columns (list[str]): List of dataframe column names. - vcfs (list[str]): Small variant VCF(s) located in TSO500 results directory. + vcfs (dict): Small variant VCF(s) located in TSO500 results directory. 
""" - vcf_list_columns = ["vcf", "sample_type"] + vcf_list_columns = {"vcf": polars.String, "sample_type": polars.String} def __init__( self, @@ -62,13 +62,18 @@ def __init__( self.vcfs = glob.glob(f"{results_dir}/{glob_pattern}") self.inpred_id_regex = rf"{inpred_id_regex}" self.tumor_sample_types = set(tumor_sample_types.split(",")) - self.dataframe = pandas.DataFrame(columns=self.vcf_list_columns) + self.dataframe = polars.DataFrame(schema=self.vcf_list_columns) # Try reading small variant VCF list or start from scratch if vcf_list: try: - self.dataframe = pandas.read_csv( - vcf_list, sep="\t", names=self.vcf_list_columns, on_bad_lines="warn" + self.dataframe = polars.read_csv( + source=vcf_list, + separator="\t", + schema=self.vcf_list_columns, + ignore_errors=True, + has_header=False, + raise_if_empty=False, ) except FileNotFoundError: logger.warning( @@ -124,7 +129,7 @@ def update(self): continue # Avoid duplication - if small_variant_vcf.vcf in self.dataframe["vcf"].values: + if small_variant_vcf.vcf in self.dataframe["vcf"].to_list(): logger.warning(f"{vcf} is already in small variant VCF list, skipping.") continue @@ -136,20 +141,20 @@ def update(self): continue # Add vcf to list - self.dataframe.loc[len(self.dataframe)] = small_variant_vcf.row() + self.dataframe = polars.concat([self.dataframe, small_variant_vcf.row()]) # Check if new patient ID is represented multiple times patient_sample_count = ( - self.dataframe["vcf"].str.contains(small_variant_vcf.patient_id).sum() - ) + self.dataframe["vcf"] == small_variant_vcf.patient_id + ).sum() if patient_sample_count > 1: logger.warning( f"patient {small_variant_vcf.patient_id} has {patient_sample_count} vcf(s) in the small variant VCF list." 
) # Write updated small variant VCF list to file - self.dataframe.drop_duplicates().to_csv( - self.output, sep="\t", header=False, index=False + self.dataframe.unique().write_csv( + file=self.output, separator="\t", include_header=False ) @@ -204,4 +209,4 @@ def row(self): """ Return small variant VCF list row. """ - return [self.vcf, self.sample_type] + return polars.DataFrame({"vcf": [self.vcf], "sample_type": [self.sample_type]}) From 6585e512f3b436fe611c9a86afb25b300ae59597 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Thu, 9 Apr 2026 15:48:33 +0200 Subject: [PATCH 15/20] test: adapt tests to using polars --- tests/test_update_small_variant_vcf_list_main.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_update_small_variant_vcf_list_main.py b/tests/test_update_small_variant_vcf_list_main.py index 9cdc38c..e912965 100644 --- a/tests/test_update_small_variant_vcf_list_main.py +++ b/tests/test_update_small_variant_vcf_list_main.py @@ -8,6 +8,7 @@ from contextlib import nullcontext from os import path +import polars import pytest from tsoppy.update_small_variant_vcf_list.main import InvalidSampleType, Vcf, VcfList @@ -187,7 +188,12 @@ def test_row(self): "patient_id": "IPH0001", "sample_type": "T", "tumor_sample_types": tumor_sample_types, - "expected": ["IPH0001-D01-T01-A01_MergedSmallVariants.genome.vcf", "T"], + "expected": polars.DataFrame( + { + "vcf": ["IPH0001-D01-T01-A01_MergedSmallVariants.genome.vcf"], + "sample_type": ["T"], + } + ), }, ] @@ -199,4 +205,4 @@ def test_row(self): test_case["tumor_sample_types"], ) got = vcf.row() - assert got == test_case["expected"] + assert got.equals(test_case["expected"]) From d7031294cb3b6b3917af9db21d7c397877e60579 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 13 Apr 2026 13:25:43 +0200 Subject: [PATCH 16/20] fix: make regex a raw string --- src/tsoppy/cli.py | 2 +- tests/test_update_small_variant_vcf_list_main.py | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/src/tsoppy/cli.py b/src/tsoppy/cli.py index 8dabc70..c8585e8 100644 --- a/src/tsoppy/cli.py +++ b/src/tsoppy/cli.py @@ -91,7 +91,7 @@ def update_small_variant_vcf_list( help="Regular expression to extract the inpred_id from the VCF file name.", callback=inpred_id_regex_callback, ), - ] = "(?P\D{3}\d{4})-\D\d{2}-(?P\D)\d{2}-\D\d{2}.*.vcf$", + ] = r"(?P\D{3}\d{4})-\D\d{2}-(?P\D)\d{2}-\D\d{2}.*.vcf$", output: Annotated[ str, typer.Option(help="Name of new small variant VCF list.") ] = "small_variant_vcf_list_.tsv", diff --git a/tests/test_update_small_variant_vcf_list_main.py b/tests/test_update_small_variant_vcf_list_main.py index e912965..638c964 100644 --- a/tests/test_update_small_variant_vcf_list_main.py +++ b/tests/test_update_small_variant_vcf_list_main.py @@ -20,7 +20,7 @@ glob_pattern = "**/Results/**/*_MergedSmallVariants.genome.vcf" tumor_sample_types = "C,D,d,L,M,P,p,R,r,T,X" inpred_id_regex = ( - "(?P\\D{3}\\d{4})-\\D\\d{2}-(?P\\D)\\d{2}-\\D\\d{2}.*.vcf$" + r"(?P\D{3}\d{4})-\D\d{2}-(?P\D)\d{2}-\D\d{2}.*.vcf$" ) From f75043457a34d36c1506853675e37dfd66910c31 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 13 Apr 2026 13:26:30 +0200 Subject: [PATCH 17/20] fix: use polars equals to compare dataframes --- src/tsoppy/update_small_variant_vcf_list/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tsoppy/update_small_variant_vcf_list/main.py b/src/tsoppy/update_small_variant_vcf_list/main.py index 6065c2d..65aa424 100644 --- a/src/tsoppy/update_small_variant_vcf_list/main.py +++ b/src/tsoppy/update_small_variant_vcf_list/main.py @@ -95,7 +95,7 @@ def __eq__(self, other): """ if not isinstance(other, VcfList): return NotImplemented - if self.dataframe != other.dataframe: + if not self.dataframe.equals(other.dataframe): return False if self.inpred_id_regex != other.inpred_id_regex: return False From 982ebd7e562f0bb5020c76c8815e87b97fca4623 Mon Sep 17 00:00:00 2001 From: Martin Rippin 
<74295098+marrip@users.noreply.github.com> Date: Tue, 14 Apr 2026 10:03:03 +0200 Subject: [PATCH 18/20] feat: rm None as input type for results_dir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Håvard Molversmyr <54852797+molversmyr@users.noreply.github.com> --- src/tsoppy/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tsoppy/cli.py b/src/tsoppy/cli.py index c8585e8..5ed63b0 100644 --- a/src/tsoppy/cli.py +++ b/src/tsoppy/cli.py @@ -73,7 +73,7 @@ def tumor_sample_types_callback(value: str) -> str: @app.command() def update_small_variant_vcf_list( results_dir: Annotated[ - Path | None, + Path, typer.Option( help="Directory where the results of the latest TSO500 run are stored." ), From 68c849f639ebd85036822ad6b422add0e7ff12fb Mon Sep 17 00:00:00 2001 From: Martin Rippin <74295098+marrip@users.noreply.github.com> Date: Fri, 12 Jun 2026 08:25:51 +0200 Subject: [PATCH 19/20] docs: apply @danielvo 's suggestion Co-authored-by: danielvo <7126118+danielvo@users.noreply.github.com> --- src/tsoppy/update_small_variant_vcf_list/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tsoppy/update_small_variant_vcf_list/main.py b/src/tsoppy/update_small_variant_vcf_list/main.py index 65aa424..7e62417 100644 --- a/src/tsoppy/update_small_variant_vcf_list/main.py +++ b/src/tsoppy/update_small_variant_vcf_list/main.py @@ -4,7 +4,7 @@ a regular expression matching InPreD IDs, a set of tumor sample types and the path to the new small variant vcf list. 'VcfList' has a method to update the current small variant vcf list with vcfs found in the TSO500 results directory. 'Vcf' is defined by a path to a small variant vcf and a set of tumor sample types. -'Vcf' provides a method to create a new row for a pandas dataframe. +'Vcf' provides a method to create a new row for a polars dataframe.
""" import glob From b17acddbca466050f539afb3adc7623779c6c4a7 Mon Sep 17 00:00:00 2001 From: Martin Rippin <74295098+marrip@users.noreply.github.com> Date: Fri, 12 Jun 2026 08:39:02 +0200 Subject: [PATCH 20/20] fix: correct __eq__ function for both classes according to @danielvo 's suggestion Co-authored-by: Martin Rippin <74295098+marrip@users.noreply.github.com> --- src/tsoppy/update_small_variant_vcf_list/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tsoppy/update_small_variant_vcf_list/main.py b/src/tsoppy/update_small_variant_vcf_list/main.py index 7e62417..b023e05 100644 --- a/src/tsoppy/update_small_variant_vcf_list/main.py +++ b/src/tsoppy/update_small_variant_vcf_list/main.py @@ -103,7 +103,7 @@ def __eq__(self, other): return False if self.tumor_sample_types != other.tumor_sample_types: return False - return self.vcfs != other.vcfs + return self.vcfs == other.vcfs def update(self): """ @@ -203,7 +203,7 @@ def __eq__(self, other): return False if self.sample_type != other.sample_type: return False - return self.vcf != other.vcf + return self.vcf == other.vcf def row(self): """