Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ requires = ["setuptools", "setuptools-scm", "setuptools-git-versioning"]
build-backend = "setuptools.build_meta"

[project]
dependencies = ["typer>=0.24.1"]
dependencies = ["polars>=1.39.3", "typer>=0.24.1"]
dynamic = ["version"]
name = "tsoppy"
requires-python = ">=3.14"
Expand All @@ -12,7 +12,7 @@ requires-python = ">=3.14"
tsoppy = "tsoppy.cli:app"

[project.optional-dependencies]
dev = ["isort==8.0.1", "ruff==0.15.6"]
dev = ["isort==8.0.1", "pytest==9.0.2", "ruff==0.15.6"]
lint = ["ruff==0.15.6"]
test = ["pytest==9.0.2", "pytest-emoji==0.2.0", "pytest-md==0.2.0"]

Expand Down
101 changes: 91 additions & 10 deletions src/tsoppy/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,27 @@
"""

import importlib.metadata
import logging
import re
from pathlib import Path
from typing import Annotated

import typer

from tsoppy.update_small_variant_vcf_list.main import VcfList

# Configure the root logger once, when the CLI module is imported: INFO level and
# lines of the form "YYYY/MM/DD HH:MM:SS LEVEL: message". Loggers created elsewhere
# with logging.getLogger(<name>) propagate to the root logger and therefore share
# this handler and format.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s: %(message)s",
datefmt="%Y/%m/%d %H:%M:%S",
)
# Logger for messages emitted by the CLI module itself.
logger = logging.getLogger(__name__)

# Typer application object; commands are registered on it with @app.command().
app = typer.Typer()

# Version of the installed "tsoppy" distribution, read from package metadata.
# NOTE(review): presumably derived from the git tag at build time (the project
# declares a dynamic version) - confirm against the build configuration.
app_version = importlib.metadata.version("tsoppy")


Expand All @@ -19,17 +35,82 @@ def version():
print(f"tsoppy version {app_version}")


def glob_pattern_callback(value: str) -> str:
    """
    Validate the glob pattern option.

    Args:
        value: Glob pattern as received from the command line (or its default).

    Returns:
        The unchanged pattern when it ends with '.vcf'.

    Raises:
        typer.BadParameter: If the pattern does not end with '.vcf'.
    """
    # Accept the valid case first; everything else is rejected below.
    if value.endswith(".vcf"):
        return value
    raise typer.BadParameter("Glob pattern must end with '.vcf'.")


def inpred_id_regex_callback(value: str) -> str:
    """
    Validate the regular expression used to parse InPreD IDs.

    The pattern is compiled so that syntax errors surface as a CLI parameter
    error instead of failing later while VCF paths are parsed, and the required
    named groups are looked up in the compiled pattern rather than by substring
    search (which would also accept e.g. a literal '<patient_id>').

    Args:
        value: Regular expression as received from the command line (or its default).

    Returns:
        The unchanged regular expression.

    Raises:
        typer.BadParameter: If the pattern does not compile or lacks one of the
            named groups 'patient_id' and 'sample_type'.
    """
    try:
        named_groups = re.compile(value).groupindex
    except re.error as err:
        raise typer.BadParameter(
            f"inpred_id_regex is not a valid regular expression: {err}"
        ) from err

    # Checked in this order so the first missing group is reported, as before.
    for group in ("patient_id", "sample_type"):
        if group not in named_groups:
            raise typer.BadParameter(
                f"inpred_id_regex must contain a named group '{group}'."
            )
    return value


def tumor_sample_types_callback(value: str) -> str:
    """
    Validate that tumor_sample_types is a comma-separated list of single letters.

    A list with a single entry (e.g. 'T') is valid; the previous pattern
    required at least one comma and therefore rejected it.

    Args:
        value: Comma-separated sample type codes, e.g. 'C,D,T'.

    Returns:
        The unchanged value.

    Raises:
        typer.BadParameter: If the value is empty, contains anything other than
            single ASCII letters separated by single commas, or has a leading or
            trailing comma.
    """
    # NOTE(review): the visible `tumor_sample_types` option definition does not
    # pass `callback=`, so this validator only runs once it is attached there.
    # fullmatch already anchors both ends, so no '^'/'$' are needed.
    if not re.fullmatch(r"(?:[A-Za-z],)*[A-Za-z]", value):
        raise typer.BadParameter(
            "tumor_sample_types must be comma-separated list of single letters."
        )
    return value


@app.command()
def placeholder(
user_name: Annotated[str, typer.Option("--name", "-n")],
user_id: Annotated[str, typer.Option("--id", "-i")],
verbose: Annotated[bool, typer.Option("--verbose", "-v")] = False,
def update_small_variant_vcf_list(
results_dir: Annotated[
Path,
typer.Option(
help="Directory where the results of the latest TSO500 run are stored."
),
],
glob_pattern: Annotated[
str,
typer.Option(
help="Glob pattern to search for small variant VCF files in the results directory.",
callback=glob_pattern_callback,
),
] = "**/Results/**/*_MergedSmallVariants.genome.vcf",

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we have all these values in config files? The Dragen and LocalApp pipelines will each need their own config file with different paths.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we need those in a config file. I have started on one in #30; let me know if that makes sense. Once we have the input classes settled, I will rework this PR to use them.

inpred_id_regex: Annotated[
str,
typer.Option(
help="Regular expression to extract the inpred_id from the VCF file name.",
callback=inpred_id_regex_callback,
),
] = r"(?P<patient_id>\D{3}\d{4})-\D\d{2}-(?P<sample_type>\D)\d{2}-\D\d{2}.*.vcf$",
output: Annotated[
str, typer.Option(help="Name of new small variant VCF list.")
] = "small_variant_vcf_list_<YYYYMMDD>.tsv",
tumor_sample_types: Annotated[
str,
typer.Option(
help="Comma-separated list of sample types that are considered tumor samples."
),
] = "C,D,d,L,M,P,p,R,r,T,X",
vcf_list: Annotated[
Path | None, typer.Option(help="Path to list of small variant VCF files.")
] = None,
):
"""
This is the helptext for the placeholder command that demonstrates how to
use Typer for CLI applications.
Updates the small variant VCF list based on VCF(s) in results directory.
"""
if verbose:
print(f"{user_name} has the following id: {user_id}")
else:
print(f"{user_name}: {user_id}")
logger.info("Start updating small variant VCF list.")
small_variant_vcf_list = VcfList(
results_dir, glob_pattern, vcf_list, inpred_id_regex, tumor_sample_types, output
)
small_variant_vcf_list.update()
logger.info("Finished updating small variant VCF list.")
Empty file.
212 changes: 212 additions & 0 deletions src/tsoppy/update_small_variant_vcf_list/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
"""
This module defines the classes 'VcfList' and 'Vcf'.
'VcfList' takes a directory holding TSO500 results, a glob to identify small variant vcf files, the current small variant vcf list,
a regular expression matching InPreD IDs, a set of tumor sample types and the path to the new small variant vcf list.
'VcfList' has a method to update the current small variant vcf list with vcfs found in the TSO500 results directory.
'Vcf' is defined by a path to a small variant vcf and a set of tumor sample types.
'Vcf' provides a method to create a new row for a polars dataframe.
"""

import glob
import logging
import re
from datetime import datetime
from pathlib import Path

import polars

# Module-level logger; its records propagate to the root logger that the CLI configures with logging.basicConfig
logger = logging.getLogger(__name__)


class InvalidSampleType(Exception):
    """
    Raised when a sample type code is not accepted.

    Attributes:
        msg: Human-readable description; also used as the string form of the exception.
    """

    def __init__(self, msg="sample type is not valid"):
        # Hand the message to Exception so it also shows up in `args`.
        super().__init__(msg)
        self.msg = msg

    def __str__(self):
        return self.msg


class VcfList:
    """
    Represents small variant VCF list.

    Attributes:
        dataframe (polars.DataFrame): Current version of the small variant VCF list.
        inpred_id_regex (str): Regular expression matching InPreD IDs.
        output (str): Path to updated version of small variant VCF list.
        tumor_sample_types (set[str]): Single letter codes representing a tumor sample.
        vcf_list_columns (dict): Dataframe column names mapped to their polars data types.
        vcfs (list[str]): Small variant VCF(s) located in TSO500 results directory.
    """

    vcf_list_columns = {"vcf": polars.String, "sample_type": polars.String}

    def __init__(
        self,
        results_dir: Path,
        glob_pattern: str,
        vcf_list: Path | None,
        inpred_id_regex: str,
        tumor_sample_types: str,
        output: str,
    ):
        """
        Create new instance of VcfList.

        Args:
            results_dir: Directory that is searched for small variant VCF files.
            glob_pattern: Glob pattern, relative to results_dir, selecting the VCF files.
            vcf_list: Existing tab-separated, header-less small variant VCF list, or
                None to start from an empty list. A missing file is logged and
                treated like an empty list.
            inpred_id_regex: Regular expression with the named groups used by 'Vcf'.
            tumor_sample_types: Comma-separated sample type codes counted as tumor.
            output: Path of the list to write; the placeholder '<YYYYMMDD>' is
                replaced with the current local date.
        """
        # recursive=True is required for '**' to span any number of directories;
        # without it glob treats '**' like a single '*'.
        self.vcfs = glob.glob(f"{results_dir}/{glob_pattern}", recursive=True)
        self.inpred_id_regex = inpred_id_regex
        self.tumor_sample_types = set(tumor_sample_types.split(","))
        self.dataframe = polars.DataFrame(schema=self.vcf_list_columns)

        # Try reading small variant VCF list or start from scratch
        if vcf_list is not None:
            try:
                self.dataframe = polars.read_csv(
                    source=vcf_list,
                    separator="\t",
                    schema=self.vcf_list_columns,
                    ignore_errors=True,
                    has_header=False,
                    raise_if_empty=False,
                )
            except FileNotFoundError:
                logger.warning(
                    f"{vcf_list} not found, creating new small variant VCF list."
                )
        else:
            logger.info("no small variant VCF list specified, creating new one.")

        # Replace placeholder with actual date (no-op when the placeholder is absent)
        self.output = output.replace(
            "<YYYYMMDD>", datetime.now().strftime("%Y%m%d")
        )

    def __eq__(self, other):
        """
        Compare to other class instance.

        Two lists are equal when their dataframe, regex, output path, tumor sample
        types and discovered VCFs are all equal.
        """
        if not isinstance(other, VcfList):
            return NotImplemented
        if not self.dataframe.equals(other.dataframe):
            return False
        return (
            self.inpred_id_regex == other.inpred_id_regex
            and self.output == other.output
            and self.tumor_sample_types == other.tumor_sample_types
            and self.vcfs == other.vcfs
        )

    def update(self):
        """
        Add VCF(s) from results directory to small variant VCF list.

        VCFs are skipped (with a warning) when their InPreD ID cannot be parsed,
        their sample type is not accepted, they are already listed, or they
        belong to a control sample (patient ID starting with 'IPC'). The
        de-duplicated list is then written to 'self.output' as a tab-separated
        file without header.
        """
        # Build the lookup once instead of converting the column on every iteration.
        known_vcfs = set(self.dataframe["vcf"].to_list())

        # Loop over all small variant VCFs
        for vcf in self.vcfs:
            # Try to create vcf class instance. When construction fails there is
            # no instance to inspect, so the handlers must only use 'vcf'.
            try:
                small_variant_vcf = Vcf(
                    vcf, self.inpred_id_regex, self.tumor_sample_types
                )
            except AttributeError:
                logger.warning(f"could not parse InPreD ID from {vcf}, skipping.")
                continue
            except InvalidSampleType:
                # Re-parse the path only to report the offending sample type.
                match = re.search(self.inpred_id_regex, vcf)
                sample_type = match.group("sample_type") if match else "unknown"
                logger.warning(
                    f"{vcf} has sample type {sample_type} which is not {self.tumor_sample_types} or N(ormal), skipping."
                )
                continue

            # Avoid duplication
            if small_variant_vcf.vcf in known_vcfs:
                logger.warning(f"{vcf} is already in small variant VCF list, skipping.")
                continue

            # Exclude control samples
            if small_variant_vcf.patient_id.startswith("IPC"):
                logger.warning(
                    f"{small_variant_vcf.patient_id} is a control sample, skipping."
                )
                continue

            # Add vcf to list
            self.dataframe = polars.concat([self.dataframe, small_variant_vcf.row()])
            known_vcfs.add(small_variant_vcf.vcf)

            # Check if new patient ID is represented multiple times. The column
            # holds paths, so look for the patient ID inside each path; an
            # equality test against the ID can never match.
            patient_sample_count = (
                self.dataframe["vcf"]
                .str.contains(small_variant_vcf.patient_id, literal=True)
                .sum()
            )
            if patient_sample_count > 1:
                logger.warning(
                    f"patient {small_variant_vcf.patient_id} has {patient_sample_count} vcf(s) in the small variant VCF list."
                )

        # Write updated small variant VCF list to file; keep row order stable so
        # the written list does not get reshuffled between runs.
        self.dataframe.unique(maintain_order=True).write_csv(
            file=self.output, separator="\t", include_header=False
        )


class Vcf:

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we select a different name here, considering that we also have the Vcf(BaseInput) class? This object represents a file path (which happens to also inform about the patient and tumor type), while the other VCF object intends to track the VCF content/variants. Would VCF_path be a good choice here?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, I think we need to rename and restructure a good deal to fit our newly implemented classes.

"""
Represents small variant VCF.

Attributes:
patient_id (str): ID of patient that the VCF belongs to.
sample_type (str): Single letter code representing type of sample, e.g. T = tumor.
vcf (str): Path to VCF file.
"""

def __init__(self, vcf: str, inpred_id_regex: str, tumor_sample_types: set):
    """
    Create new instance of Vcf.

    Args:
        vcf: Path to the small variant VCF file.
        inpred_id_regex: Regular expression with the named groups 'patient_id'
            and 'sample_type'; it is searched in the path.
        tumor_sample_types: Sample type codes that count as tumor.

    Raises:
        AttributeError: If the regular expression does not match the path.
        InvalidSampleType: If the sample type is neither 'N' nor one of
            tumor_sample_types.
    """
    self.vcf = vcf

    # Parse InPreD ID to get patient ID and sample type
    match = re.search(inpred_id_regex, self.vcf)
    if match is None:
        # Same exception type as before, now with a message naming the path.
        raise AttributeError(f"could not parse InPreD ID from {self.vcf}")
    self.patient_id = match.group("patient_id")
    self.sample_type = match.group("sample_type")

    # Validate sample type is N(ormal) or included in tumor_sample_types
    if self.sample_type == "N":
        return
    if self.sample_type not in tumor_sample_types:
        raise InvalidSampleType(
            f"sample type {self.sample_type} of {self.vcf} is not valid"
        )

    # Reset any sample type in tumor_sample_types with T; stay silent when the
    # code already is T, since nothing is replaced in that case.
    if self.sample_type != "T":
        logger.warning(
            f"sample type code {self.sample_type} for {self.vcf} will be replaced with T"
        )
        self.sample_type = "T"

def __eq__(self, other):
"""
Compare to other class instance.
"""
if not isinstance(other, Vcf):
return NotImplemented
if self.patient_id != other.patient_id:
return False
if self.sample_type != other.sample_type:
return False
return self.vcf == other.vcf

def row(self):
    """
    Return small variant VCF list row.

    The result is a one-row polars dataframe with the columns 'vcf' and
    'sample_type'.
    """
    columns = {"vcf": [self.vcf], "sample_type": [self.sample_type]}
    return polars.DataFrame(columns)
10 changes: 0 additions & 10 deletions tests/cli_test.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N
Loading