InPreD · marrip · Jun 10, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026
@@ -3,7 +3,12 @@ requires = ["setuptools", "setuptools-scm", "setuptools-git-versioning"]
 build-backend = "setuptools.build_meta"
 
 [project]
-dependencies = ["cyvcf2>=0.32.1", "msgspec>=0.21.1", "typer>=0.24.1"]
+dependencies = [
+    "cyvcf2>=0.32.1",
+    "msgspec>=0.21.1",
+    "polars>=1.39.3",
+    "typer>=0.24.1",
+]
 dynamic = ["version"]
 name = "tsoppy"
 requires-python = ">=3.14"

@@ -0,0 +1,98 @@
+import logging
+import re
+
+import polars
+
+# Module-level logger named after this module; per the original note its
+# handlers and level are set up by the CLI (that setup is not visible here)
+logger = logging.getLogger(__name__)
+
+
+def Parse_section_tsv(
+    path: str, key_value_sections: list[str]
+) -> tuple[list[str], dict[str, polars.DataFrame]]:
+    """Parse a sectioned TSV file into headers and per-section DataFrames.
+
+    The file is read without a header row. Every non-null cell found above the
+    first ``[section]`` marker is returned as a header. For each section the
+    first row is used as column names and the remaining rows as data.
+
+    Args:
+        path: Path of the TSV file to read.
+        key_value_sections: Names of sections that hold key/value pairs. If
+            such a section has exactly two columns it is transposed, so the
+            keys become the column names and the values a single data row.
+
+    Returns:
+        A tuple of the header values and a dict mapping each section name to
+        its DataFrame. If the file does not contain any section, every
+        non-null cell is returned as a header and the dict is empty.
+
+    Raises:
+        FileNotFoundError: If ``path`` does not exist.
+        polars.exceptions.NoDataError: If the file is empty.
+    """
+    try:
+        df = polars.read_csv(path, separator="\t", has_header=False)
+    except FileNotFoundError:
+        logger.error(f"File {path} not found.")
+        raise
+    except polars.exceptions.NoDataError:
+        logger.error(f"File {path} is empty.")
+        raise
+
+    section_idx = _get_section_idx(df)
+
+    # without any section there is no first section start to look up, so the
+    # whole file is treated as headers instead of failing with an IndexError
+    if not section_idx:
+        logger.warning(f"File {path} does not contain any sections.")
+        return _parse_headers(df, df.height), {}
+
+    # a section starts on the row after its "[name]" marker, so the rows above
+    # the first marker (start - 1 of them, possibly zero) hold the headers
+    headers = _parse_headers(df, section_idx[0][1] - 1)
+
+    section_dfs = {}
+    for name, start, length in section_idx:
+        # create slice of dataframe for the section
+        df_slice = df.slice(start, length)
+
+        # a null in the first row would end up as a column name, so drop the
+        # completely empty columns and fill the remaining nulls
+        if any(item is None for item in df_slice.row(0)):
+            df_slice = _handle_row_with_nulls(df_slice)
+
+        # key value sections are transposed so that the keys become the first
+        # row; anything but two columns is left untouched and reported
+        if name in key_value_sections:
+            if df_slice.width == 2:
+                df_slice = df_slice.transpose()
+            else:
+                logger.warning(
+                    f"Section {name} is supposed to be a key value section but does not contain exactly two columns (found {df_slice.width})."
+                )
+
+        # assume first row contains column names
+        df_header = df_slice.head(1).to_dicts().pop()
+
+        # remove first row and rename columns and link it to the section name
+        section_dfs[name] = df_slice.rename(df_header).slice(1)
+    return headers, section_dfs
+
+
+def _get_section_idx(df: polars.DataFrame) -> list[tuple[str, int, int]]:
+    """Get the name, start index and length of each section in the DataFrame.
+
+    A section is opened by a row whose first column holds ``[name]``; its rows
+    start on the following row. It is closed by a row that is null in every
+    column, by the marker of the next section or by the end of the DataFrame.
+    Sections without any rows are skipped.
+
+    Args:
+        df: DataFrame holding the raw rows of the file.
+
+    Returns:
+        One ``(name, start, length)`` tuple per section in order of
+        appearance, usable as ``df.slice(start, length)``.
+    """
+    # compile once instead of looking the pattern up for every row
+    section_pattern = re.compile(r"^\[(?P<section>.*)\]$")
+    section = ""
+    # 0 doubles as "no section open": a marker on row i sets the start to
+    # i + 1, which is always greater than 0
+    section_start = 0
+    section_idx = []
+    for idx, *cells in df.with_row_index().iter_rows():
+        first_cell = cells[0]
+        # only strings can hold a marker; this skips nulls and keeps re from
+        # raising a TypeError on non-string cells
+        match = (
+            section_pattern.search(first_cell)
+            if isinstance(first_cell, str)
+            else None
+        )
+        if match:
+            # a marker directly below an open section (no empty row in
+            # between) closes that section instead of silently dropping it
+            if section_start > 0 and idx > section_start:
+                section_idx.append((section, section_start, idx - section_start))
+            section_start = idx + 1
+            section = match.group("section")
+        elif all(cell is None for cell in cells):
+            # an empty row closes the open section if it has any rows
+            if section_start > 0 and idx > section_start:
+                section_idx.append((section, section_start, idx - section_start))
+                section_start = 0
+    # the end of the DataFrame closes a section that is still open
+    if section_start > 0 and df.height > section_start:
+        section_idx.append((section, section_start, df.height - section_start))
+    return section_idx
+
+
+def _parse_headers(df: polars.DataFrame, header_rows: int) -> list[str]:
+    """Collect the non-null cells of the first ``header_rows`` rows, row by row."""
+    top_rows = df.head(header_rows).rows()
+    return [cell for cells in top_rows for cell in cells if cell is not None]
+
+
+def _handle_row_with_nulls(df: polars.DataFrame) -> polars.DataFrame:
+    """Drop columns that are entirely null and replace the remaining nulls with "-"."""
+    # a column whose null count equals the height has no column header nor values
+    kept = [name for name in df.columns if df[name].null_count() != df.height]
+
+    # every remaining column is cast to String before its nulls become "-"
+    return df.select(kept).with_columns(
+        polars.all().cast(polars.String).fill_null("-")
+    )
@@ -0,0 +1,266 @@
+from contextlib import nullcontext
+from os import path
+
+import polars
+from pytest import mark, raises
+
+from tsoppy.general.file_parser import (
+    Parse_section_tsv,
+    _get_section_idx,
+    _handle_row_with_nulls,
+    _parse_headers,
+)
+
+# Define path to test data - cannot be absolute due to different paths locally and in CI.
+# NOTE(review): being relative, it resolves against the current working
+# directory, so presumably pytest has to be started from the repository root.
+test_data_dir = "tests/test_data/general"
+
+
+@mark.parametrize(
+    "inputs, exception, want",
+    [
+        (
+            (path.join(test_data_dir, "parse_section_tsv/standard.tsv"), []),
+            nullcontext(),
+            (
+                ["header1"],
+                {
+                    "section1": polars.DataFrame(
+                        {"col1": ["value1", "value2"], "col2": ["value3", "value4"]}
+                    )
+                },
+            ),
+        ),
+        (
+            (path.join(test_data_dir, "parse_section_tsv/multiple_sections.tsv"), []),
+            nullcontext(),
+            (
+                ["header1"],
+                {
+                    "section1": polars.DataFrame(
+                        {"col1": ["value1", "value2"], "col2": ["value3", "value4"]}
+                    ),
+                    "section2": polars.DataFrame(
+                        {"col1": ["value1", "value2"], "col2": ["value3", "value4"]}
+                    ),
+                },
+            ),
+        ),
+        (
+            (path.join(test_data_dir, "parse_section_tsv/no_headers.tsv"), []),
+            nullcontext(),
+            (
+                [],
+                {
+                    "section1": polars.DataFrame(
+                        {"col1": ["value1", "value2"], "col2": ["value3", "value4"]}
+                    )
+                },
+            ),
+        ),
+        (
+            (path.join(test_data_dir, "parse_section_tsv/extra_empty_lines.tsv"), []),
+            nullcontext(),
+            (
+                ["header1"],
+                {
+                    "section1": polars.DataFrame(
+                        {"col1": ["value1", "value2"], "col2": ["value3", "value4"]}
+                    )
+                },
+            ),
+        ),
+        (
+            (path.join(test_data_dir, "parse_section_tsv/null_columns.tsv"), []),
+            nullcontext(),
+            (
+                ["header1"],
+                {
+                    "section1": polars.DataFrame(
+                        {"col1": ["value1", "value2"], "col2": ["value3", "value4"]}
+                    )
+                },
+            ),
+        ),
+        (
+            (
+                path.join(
+                    test_data_dir, "parse_section_tsv/empty_first_column_name.tsv"
+                ),
+                [],
+            ),
+            nullcontext(),
+            (
+                ["header1"],
+                {
+                    "section1": polars.DataFrame(
+                        {"-": ["value1", "value2"], "col2": ["value3", "value4"]}
+                    )
+                },
+            ),
+        ),
+        (
+            (path.join(test_data_dir, "parse_section_tsv/key_value.tsv"), ["section1"]),
+            nullcontext(),
+            (
+                ["header1"],
+                {
+                    "section1": polars.DataFrame(
+                        {"key1": ["value1"], "key2": ["value2"]}
+                    )
+                },
+            ),
+        ),
+        (
+            (path.join(test_data_dir, "parse_section_tsv/non-existent.tsv"), []),
+            raises(FileNotFoundError),
+            ([], {}),
+        ),
+        (
+            (path.join(test_data_dir, "parse_section_tsv/empty.tsv"), []),
+            raises(polars.exceptions.NoDataError),
+            ([], {}),
+        ),
+    ],
+)
+def test_parse_section_tsv(inputs, exception, want):
+    """Check headers and sections parsed from each fixture file.
+
+    ``inputs`` holds the arguments for ``Parse_section_tsv``, ``exception`` is
+    the context manager the call runs in (``nullcontext()`` or ``raises(...)``)
+    and ``want`` is the expected ``(headers, sections)`` result. When the call
+    raises inside ``raises(...)``, the assertions below it are not reached.
+    """
+    want_headers, want_sections = want
+    with exception:
+        got_headers, got_sections = Parse_section_tsv(*inputs)
+        assert got_headers == want_headers
+        # compare the key sets so unexpected extra sections fail the test too
+        assert got_sections.keys() == want_sections.keys()
+        for name, want_df in want_sections.items():
+            assert got_sections[name].equals(want_df)
+
+
+@mark.parametrize(
+    "df, want",
+    [
+        (
+            polars.DataFrame(
+                {
+                    "col1": ["[section1]", "col1", "value1"],
+                    "col2": [None, "col2", "value2"],
+                }
+            ),
+            [("section1", 1, 2)],
+        ),
+        (
+            polars.DataFrame(
+                {
+                    "col1": ["header1", None, "[section1]", "col1", "value1"],
+                    "col2": [None, None, None, "col2", "value2"],
+                }
+            ),
+            [("section1", 3, 2)],
+        ),
+        (
+            polars.DataFrame(
+                {
+                    "col1": [None, None, "[section1]", "col1", "value1"],
+                    "col2": [None, None, None, "col2", "value2"],
+                }
+            ),
+            [("section1", 3, 2)],
+        ),
+        (
+            polars.DataFrame(
+                {
+                    "col1": [None, None, "[section1]", None, "value1"],
+                    "col2": [None, None, None, "col2", "value2"],
+                }
+            ),
+            [("section1", 3, 2)],
+        ),
+        (
+            polars.DataFrame(
+                {
+                    "col1": [
+                        None,
+                        None,
+                        "[section1]",
+                        "col1",
+                        "value1",
+                        None,
+                        "[section2]",
+                        "col1",
+                        "value1",
+                    ],
+                    "col2": [
+                        None,
+                        None,
+                        None,
+                        "col2",
+                        "value2",
+                        None,
+                        None,
+                        "col2",
+                        "value2",
+                    ],
+                }
+            ),
+            [("section1", 3, 2), ("section2", 7, 2)],
+        ),
+        (
+            polars.DataFrame(
+                {
+                    "col1": ["[section1]", "col1", "value1"],
+                    "col2": [None, "col2", "value2"],
+                    "col3": [None, None, None],
+                }
+            ),
+            [("section1", 1, 2)],
+        ),
+    ],
+)
+def test_get_section_idx(df, want):
+    """Check the (name, start, length) tuples found in each raw DataFrame."""
+    # the parameter is called df rather than input to avoid shadowing the builtin
+    got = _get_section_idx(df)
+    assert got == want
+
+
+@mark.parametrize(
+    "inputs, want",
+    [
+        (
+            (
+                polars.DataFrame(
+                    {"col1": [None, "header2"], "col2": ["header1", None]}
+                ),
+                2,
+            ),
+            ["header1", "header2"],
+        )
+    ],
+)
+def test_parse_headers(inputs, want):
+    """Check that non-null cells of the leading rows are returned row by row."""
+    df, header_rows = inputs
+    assert _parse_headers(df, header_rows) == want
+
+
+@mark.parametrize(
+    "df, want",
+    [
+        (
+            polars.DataFrame(
+                {
+                    "col1": ["col1", "value1"],
+                    "col2": ["col2", "value2"],
+                    "col3": [None, None],
+                }
+            ),
+            polars.DataFrame({"col1": ["col1", "value1"], "col2": ["col2", "value2"]}),
+        ),
+        (
+            polars.DataFrame(
+                {
+                    "col1": [None, "value1"],
+                    "col2": ["col2", "value2"],
+                    "col3": [None, None],
+                }
+            ),
+            polars.DataFrame({"col1": ["-", "value1"], "col2": ["col2", "value2"]}),
+        ),
+    ],
+)
+def test_handle_row_with_nulls(df, want):
+    """Check that all-null columns are dropped and remaining nulls become "-"."""
+    # the parameter is called df rather than input to avoid shadowing the builtin
+    got = _handle_row_with_nulls(df)
+    assert got.equals(want)
@@ -0,0 +1,6 @@
+header1
+
+[section1]
+	col2
+value1	value3
+value2	value4
@@ -0,0 +1,8 @@
+header1
+
+
+
+[section1]
+col1	col2
+value1	value3
+value2	value4