Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
985d65d
feat: move parser to file_parser module and change behavior to checki…
marrip Jun 3, 2026
d4e3fb4
fix: don't assume key value store but have user define it
marrip Jun 3, 2026
93f331b
chore: ruff lint
marrip Jun 3, 2026
bd0737b
docs: update comment to make purpose clearer
marrip Jun 3, 2026
4a83af3
fix: add init.py to mark subpackage general
marrip Jun 5, 2026
60d28ee
test: add test_data and unittests for Parse_section_Tsv
marrip Jun 5, 2026
962e93a
feat: intercept file not found and no data and log
marrip Jun 5, 2026
a7a2712
test: add test cases and data for empty file and non-existent
marrip Jun 5, 2026
dbe533f
fix: correct output signature for _get_section_idx
marrip Jun 5, 2026
18dfbde
test: add unit tests for _get_section_idx
marrip Jun 5, 2026
56a10e2
test: add unit tests for _parse_headers and _handle_row_with_nulls
marrip Jun 5, 2026
f4898e9
chore: lint ruff and isort
marrip Jun 5, 2026
0b9fbd7
chore: lint ruff
marrip Jun 5, 2026
2a59e8d
chore: lint ruff
marrip Jun 5, 2026
ea01cf0
chore: install polars
marrip Jun 5, 2026
2b31aae
docs: update docstring with @danielvo 's suggestion
marrip Jun 8, 2026
016a5dd
docs: add short comments to describe test cases
marrip Jun 9, 2026
abd8898
chore: include pytest in dev deps
marrip Jun 9, 2026
86b33d9
docs: include description of Parse_section_tsv
marrip Jun 9, 2026
ac891ae
docs: include header and MLA citation for cyvcf paper
marrip Jun 9, 2026
0978c77
docs: correct docstring according to @danielvo 's suggestion
marrip Jun 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@ requires = ["setuptools", "setuptools-scm", "setuptools-git-versioning"]
build-backend = "setuptools.build_meta"

[project]
dependencies = ["cyvcf2>=0.32.1", "msgspec>=0.21.1", "typer>=0.24.1"]
dependencies = [
"cyvcf2>=0.32.1",
"msgspec>=0.21.1",
"polars>=1.39.3",
"typer>=0.24.1",
]
dynamic = ["version"]
name = "tsoppy"
requires-python = ">=3.14"
Expand Down
Empty file.
98 changes: 98 additions & 0 deletions src/tsoppy/general/file_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import logging
import re

import polars

# Use logger that was set up in CLI
logger = logging.getLogger(__name__)


def Parse_section_tsv(
    path: str, key_value_sections: list[str]
) -> tuple[list[str], dict[str, polars.DataFrame]]:
    """Parse a sectioned TSV file into headers and a mapping of section names to DataFrames.

    The file is read without a header row. Non-null cells found above the
    first ``[section]`` marker are returned as headers. Each section becomes
    its own DataFrame whose column names are taken from the first row of the
    section.

    Args:
        path: Location of the TSV file.
        key_value_sections: Names of sections that hold key/value pairs in
            two columns. These are transposed so that the keys become the
            column names.

    Returns:
        A tuple ``(headers, sections)`` where ``headers`` is the list of
        header values and ``sections`` maps each section name to its
        DataFrame.

    Raises:
        FileNotFoundError: If ``path`` does not exist (logged, then re-raised).
        polars.exceptions.NoDataError: If the file is empty (logged, then
            re-raised).
        ValueError: If no section with at least one row could be found.
    """
    try:
        df = polars.read_csv(path, separator="\t", has_header=False)
    except FileNotFoundError:
        logger.error(f"File {path} not found.")
        raise
    except polars.exceptions.NoDataError:
        logger.error(f"File {path} is empty.")
        raise

    section_idx = _get_section_idx(df)
    if not section_idx:
        # fail with a clear message instead of an IndexError further down
        logger.error(f"File {path} does not contain any sections.")
        raise ValueError(f"File {path} does not contain any sections.")

    # a start index of 1 means the first marker sits on the very first row,
    # so there are no header rows above it
    first_start = section_idx[0][1]
    headers = _parse_headers(df, first_start - 1) if first_start != 1 else []

    section_dfs = {}
    for name, start, length in section_idx:
        # create slice of dataframe for the section
        df_slice = df.slice(start, length)

        # the first row becomes the column names, so it must not contain nulls
        if any(item is None for item in df_slice.row(0)):
            df_slice = _handle_row_with_nulls(df_slice)

        if name in key_value_sections:
            # key value sections need exactly two columns to be transposed
            if df_slice.width == 2:
                df_slice = df_slice.transpose()
            else:
                logger.warning(
                    f"Section {name} is supposed to be a key value section "
                    f"but has {df_slice.width} columns instead of two."
                )

        # assume first row contains column names
        df_header = df_slice.row(0, named=True)

        # remove first row and rename columns and link it to the section name
        section_dfs[name] = df_slice.rename(df_header).slice(1)
    return headers, section_dfs


def _get_section_idx(df: polars.DataFrame) -> list[tuple[str, int, int]]:
    """Get the name, start index and length of each section in the DataFrame.

    A section starts on the row after a ``[name]`` marker found in the first
    column. It ends at the next completely empty row, at the next marker, or
    at the end of the frame. Sections without any rows are skipped.

    Args:
        df: Frame to scan; the first column is searched for section markers.

    Returns:
        One ``(name, start, length)`` tuple per section, in order of
        appearance. ``start`` is the 0-based index of the first row after the
        marker and ``length`` is the number of rows in the section.
    """
    marker = re.compile(r"^\[(?P<section>.*)\]$")
    total_rows = df.height
    section_idx = []
    section = ""
    # 0 doubles as "not inside a section": a real start is always >= 1
    # because it points at the row after the marker
    section_start = 0

    for idx, row in enumerate(df.iter_rows()):
        first = row[0] if row else None
        # only strings can be markers; other dtypes would break the regex
        match = marker.search(first) if isinstance(first, str) else None

        # an empty row and a new marker both terminate the open section
        if match or all(item is None for item in row):
            if section_start > 0 and idx > section_start:
                section_idx.append((section, section_start, idx - section_start))
            section_start = 0

        if match:
            section = match.group("section")
            section_start = idx + 1

    # a section that is still open runs until the end of the frame
    if section_start > 0 and total_rows > section_start:
        section_idx.append((section, section_start, total_rows - section_start))
    return section_idx


def _parse_headers(df: polars.DataFrame, header_rows: int) -> list[str]:
    """Collect every non-null cell of the leading ``header_rows`` rows.

    Cells are gathered row by row, left to right, so the result keeps the
    order in which the values appear in the frame.
    """
    collected = []
    for record in df.head(header_rows).rows():
        collected.extend(cell for cell in record if cell is not None)
    return collected


def _handle_row_with_nulls(df: polars.DataFrame) -> polars.DataFrame:
    """Drop columns that are entirely null and replace remaining nulls with "-".

    Every remaining column is cast to a string column before the nulls are
    filled, so the returned frame contains only string columns.
    """
    # remove any columns that are completely null (no column header nor values)
    kept = [name for name in df.columns if df[name].null_count() != df.height]
    df = df.select(kept)

    # avoid null values by filling with "-"
    return df.with_columns(polars.all().cast(polars.String).fill_null("-"))
266 changes: 266 additions & 0 deletions tests/general_file_parser_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
from contextlib import nullcontext
from os import path

import polars
from pytest import mark, raises

from tsoppy.general.file_parser import (
Parse_section_tsv,
_get_section_idx,
_handle_row_with_nulls,
_parse_headers,
)

# Define path to test data - cannot be absolute due to different paths locally and in CI
test_data_dir = "tests/test_data/general"


def _section_tsv(file_name):
    """Return the path of a fixture below the parse_section_tsv test data folder."""
    return path.join(test_data_dir, f"parse_section_tsv/{file_name}")


def _value_frame(first_column="col1"):
    """Build the two-row frame that most cases expect for a parsed section."""
    return polars.DataFrame(
        {first_column: ["value1", "value2"], "col2": ["value3", "value4"]}
    )


@mark.parametrize(
    "inputs, exception, want",
    [
        # one header and one section
        (
            (_section_tsv("standard.tsv"), []),
            nullcontext(),
            (["header1"], {"section1": _value_frame()}),
        ),
        # two sections are expected back
        (
            (_section_tsv("multiple_sections.tsv"), []),
            nullcontext(),
            (
                ["header1"],
                {"section1": _value_frame(), "section2": _value_frame()},
            ),
        ),
        # no headers are expected back
        (
            (_section_tsv("no_headers.tsv"), []),
            nullcontext(),
            ([], {"section1": _value_frame()}),
        ),
        # additional empty lines must not change the result
        (
            (_section_tsv("extra_empty_lines.tsv"), []),
            nullcontext(),
            (["header1"], {"section1": _value_frame()}),
        ),
        # null columns must not show up in the result
        (
            (_section_tsv("null_columns.tsv"), []),
            nullcontext(),
            (["header1"], {"section1": _value_frame()}),
        ),
        # a missing first column name is expected to become "-"
        (
            (_section_tsv("empty_first_column_name.tsv"), []),
            nullcontext(),
            (["header1"], {"section1": _value_frame("-")}),
        ),
        # section1 is declared as key value section
        (
            (_section_tsv("key_value.tsv"), ["section1"]),
            nullcontext(),
            (
                ["header1"],
                {
                    "section1": polars.DataFrame(
                        {"key1": ["value1"], "key2": ["value2"]}
                    )
                },
            ),
        ),
        # missing file is expected to raise
        (
            (_section_tsv("non-existent.tsv"), []),
            raises(FileNotFoundError),
            ([], {}),
        ),
        # empty file is expected to raise
        (
            (_section_tsv("empty.tsv"), []),
            raises(polars.exceptions.NoDataError),
            ([], {}),
        ),
    ],
)
def test_parse_section_tsv(inputs, exception, want):
    tsv_path, key_value_sections = inputs
    want_headers, want_sections = want
    with exception:
        got_headers, got_sections = Parse_section_tsv(tsv_path, key_value_sections)
        assert got_headers == want_headers
        for name, frame in want_sections.items():
            assert name in got_sections
            assert got_sections[name].equals(frame)


def _columns_frame(*columns):
    """Build a frame whose columns are named col1, col2, ... in the given order."""
    return polars.DataFrame(
        {f"col{number}": values for number, values in enumerate(columns, start=1)}
    )


@mark.parametrize(
    "input, want",
    [
        # section marker on the very first row
        (
            _columns_frame(
                ["[section1]", "col1", "value1"],
                [None, "col2", "value2"],
            ),
            [("section1", 1, 2)],
        ),
        # header row and empty row above the section
        (
            _columns_frame(
                ["header1", None, "[section1]", "col1", "value1"],
                [None, None, None, "col2", "value2"],
            ),
            [("section1", 3, 2)],
        ),
        # only empty rows above the section
        (
            _columns_frame(
                [None, None, "[section1]", "col1", "value1"],
                [None, None, None, "col2", "value2"],
            ),
            [("section1", 3, 2)],
        ),
        # first column name of the section is missing
        (
            _columns_frame(
                [None, None, "[section1]", None, "value1"],
                [None, None, None, "col2", "value2"],
            ),
            [("section1", 3, 2)],
        ),
        # two sections separated by an empty row
        (
            _columns_frame(
                [
                    None,
                    None,
                    "[section1]",
                    "col1",
                    "value1",
                    None,
                    "[section2]",
                    "col1",
                    "value1",
                ],
                [
                    None,
                    None,
                    None,
                    "col2",
                    "value2",
                    None,
                    None,
                    "col2",
                    "value2",
                ],
            ),
            [("section1", 3, 2), ("section2", 7, 2)],
        ),
        # additional column that is completely null
        (
            _columns_frame(
                ["[section1]", "col1", "value1"],
                [None, "col2", "value2"],
                [None, None, None],
            ),
            [("section1", 1, 2)],
        ),
    ],
)
def test_get_section_idx(input, want):
    assert _get_section_idx(input) == want


@mark.parametrize(
    "inputs, want",
    [
        # headers spread over different columns are collected row by row
        (
            (
                polars.DataFrame(
                    {"col1": [None, "header2"], "col2": ["header1", None]}
                ),
                2,
            ),
            ["header1", "header2"],
        )
    ],
)
def test_parse_headers(inputs, want):
    frame, header_rows = inputs
    assert _parse_headers(frame, header_rows) == want


@mark.parametrize(
    "input, want",
    [
        # a completely null column is dropped
        (
            polars.DataFrame(
                {
                    "col1": ["col1", "value1"],
                    "col2": ["col2", "value2"],
                    "col3": [None, None],
                }
            ),
            polars.DataFrame({"col1": ["col1", "value1"], "col2": ["col2", "value2"]}),
        ),
        # a single null value is replaced with "-"
        (
            polars.DataFrame(
                {
                    "col1": [None, "value1"],
                    "col2": ["col2", "value2"],
                    "col3": [None, None],
                }
            ),
            polars.DataFrame({"col1": ["-", "value1"], "col2": ["col2", "value2"]}),
        ),
    ],
)
def test_handle_row_with_nulls(input, want):
    got = _handle_row_with_nulls(input)
    # show the offending frame only when the comparison fails
    assert got.equals(want), f"unexpected frame:\n{got}"
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
header1

[section1]
col2
value1 value3
value2 value4
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
header1



[section1]
col1 col2
value1 value3
value2 value4
Loading