-
Notifications
You must be signed in to change notification settings - Fork 0
Add sec tsv parser #31
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 15 commits
Commits
Show all changes
21 commits
Select commit
Hold shift + click to select a range
985d65d
feat: move parser to file_parser module and change behavior to checki…
marrip d4e3fb4
fix: don't assume key value store but have user define it
marrip 93f331b
chore: ruff lint
marrip bd0737b
docs: update comment to make purpose clearer
marrip 4a83af3
fix: add `__init__.py` to mark subpackage general
marrip 60d28ee
test: add test_data and unittests for Parse_section_Tsv
marrip 962e93a
feat: intercept file not found and no data and log
marrip a7a2712
test: add test cases and data for empty file and non-existent
marrip dbe533f
fix: correct output signature for _get_section_idx
marrip 18dfbde
test: add unit tests for _get_section_idx
marrip 56a10e2
test: add unit tests for _parse_headers and _handle_row_with_nulls
marrip f4898e9
chore: lint ruff and isort
marrip 0b9fbd7
chore: lint ruff
marrip 2a59e8d
chore: lint ruff
marrip ea01cf0
chore: install polars
marrip 2b31aae
docs: update docstring with @danielvo 's suggestion
marrip 016a5dd
docs: add short comments to describe test cases
marrip abd8898
chore: include pytest in dev deps
marrip 86b33d9
docs: include description of Parse_section_tsv
marrip ac891ae
docs: include header and MLA citation for cyvcf paper
marrip 0978c77
docs: correct docstring according to @danielvo 's suggestion
marrip File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,98 @@ | ||
| import logging | ||
| import re | ||
|
|
||
| import polars | ||
|
|
||
| # Use logger that was set up in CLI | ||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
def Parse_section_tsv(
    path: str, key_value_sections: list[str]
) -> tuple[list[str], dict[str, polars.DataFrame]]:
    """Parse a sectioned TSV file into headers and a mapping of section names to DataFrames.

    The file is read without a header row. Rows above the first recorded
    ``[section]`` marker are collected as headers. Within each section the
    first row is used as the column names and the remaining rows as the data.

    Args:
        path: Path of the TSV file to read.
        key_value_sections: Names of sections that store key/value pairs
            row-wise. Such a section is transposed when it has exactly two
            columns, so that the keys become the column names.

    Returns:
        A tuple ``(headers, section_dfs)``: the non-null cells found above the
        first section, and a dict mapping each section name to its DataFrame.
        If the file contains no sections, every non-null cell is returned as a
        header and the dict is empty.

    Raises:
        FileNotFoundError: Re-raised after logging when the file is missing.
        polars.exceptions.NoDataError: Re-raised after logging when the file
            is empty.
    """
    try:
        df = polars.read_csv(path, separator="\t", has_header=False)
    except FileNotFoundError:
        logger.error("File %s not found.", path)
        raise
    except polars.exceptions.NoDataError:
        logger.error("File %s is empty.", path)
        raise

    section_idx = _get_section_idx(df)
    if not section_idx:
        # Without this guard the lookup of the first section below would fail
        # with a bare IndexError that tells the user nothing about the file.
        logger.warning("File %s does not contain any sections.", path)
        return _parse_headers(df, df.height), {}

    # A first section starting at row 1 means its marker is the very first
    # row, i.e. there are no header rows above it.
    headers = []
    first_section_start = section_idx[0][1]
    if first_section_start != 1:
        headers = _parse_headers(df, first_section_start - 1)

    section_dfs = {}
    for name, start, length in section_idx:
        # create slice of dataframe for the section
        df_slice = df.slice(start, length)

        # nulls in the column-name row: drop empty columns and fill the rest
        if any(item is None for item in df_slice.row(0)):
            df_slice = _handle_row_with_nulls(df_slice)

        # key value sections are stored row-wise and must be transposed
        if name in key_value_sections:
            if df_slice.width == 2:
                df_slice = df_slice.transpose()
            else:
                logger.warning(
                    "Section %s is supposed to be a key value section "
                    "but does not contain exactly two columns (found %d).",
                    name,
                    df_slice.width,
                )

        # assume first row contains column names
        column_names = df_slice.head(1).to_dicts().pop()

        # rename columns, drop the column-name row and link it to the section name
        section_dfs[name] = df_slice.rename(column_names).slice(1)
    return headers, section_dfs
|
|
||
|
|
||
def _get_section_idx(df: polars.DataFrame) -> list[tuple[str, int, int]]:
    """Get the name, start index and length of each section in the DataFrame.

    A section begins on the row after a ``[name]`` marker in the first column.
    It ends at the next completely empty row, at the next marker, or at the
    last row of the DataFrame. Sections without any rows are not reported.

    Args:
        df: DataFrame of the whole file, read without a header row.

    Returns:
        One ``(name, start, length)`` tuple per section, in file order, where
        ``start`` is the row index of the section's first row (the row after
        the marker) and ``length`` is its number of rows.
    """
    marker = re.compile(r"^\[(?P<section>.*)\]$")
    last_row = len(df) - 1
    section = ""
    # 0 doubles as "not inside a section": a real start is always >= 1
    # because it is the row after a marker.
    section_start = 0
    section_idx = []
    for row in df.with_row_index().iter_rows():
        idx, first_cell = row[0], row[1]
        # only strings can be markers; other cell types are ordinary data
        match = marker.search(first_cell) if isinstance(first_cell, str) else None
        if match:
            # a marker directly after data rows (no blank separator) closes
            # the section that is still open instead of discarding it
            if section_start > 0 and idx > section_start:
                section_idx.append((section, section_start, idx - section_start))
            section_start = idx + 1
            section = match.group("section")
        if all(item is None for item in row[1:]):
            # an empty row terminates the open section, if any
            if section_start > 0 and idx > section_start:
                section_idx.append((section, section_start, idx - section_start))
            section_start = 0
        if idx == last_row:
            # end of data: the last row still belongs to the open section
            section_length = idx - section_start + 1
            if section_start > 0 and section_length > 0:
                section_idx.append((section, section_start, section_length))
    return section_idx
|
|
||
|
|
||
def _parse_headers(df: polars.DataFrame, header_rows: int) -> list[str]:
    """Collect the non-null cells of the first ``header_rows`` rows as headers.

    Cells are returned row by row, left to right within each row.
    """
    return [
        cell
        for row in df.head(header_rows).rows()
        for cell in row
        if cell is not None
    ]
|
|
||
|
|
||
def _handle_row_with_nulls(df: polars.DataFrame) -> polars.DataFrame:
    """Drop columns that are entirely null and replace remaining nulls with "-".

    Every kept column is cast to a string column before the nulls are filled.
    """
    n_rows = df.height

    # keep only columns holding at least one value (a column name or data)
    kept = [
        polars.col(name) for name in df.columns if df[name].null_count() != n_rows
    ]

    # avoid null values by filling with "-"
    as_text = polars.all().cast(polars.String)
    return df.select(kept).with_columns(as_text.fill_null("-"))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,266 @@ | ||
| from contextlib import nullcontext | ||
| from os import path | ||
|
|
||
| import polars | ||
| from pytest import mark, raises | ||
|
|
||
| from tsoppy.general.file_parser import ( | ||
| Parse_section_tsv, | ||
| _get_section_idx, | ||
| _handle_row_with_nulls, | ||
| _parse_headers, | ||
| ) | ||
|
|
||
| # Define path to test data - cannot be absolute due to different paths locally and in CI | ||
| test_data_dir = "tests/test_data/general" | ||
|
|
||
|
|
||
@mark.parametrize(
    "inputs, exception, want",
    [
        # one header and one section
        (
            (path.join(test_data_dir, "parse_section_tsv/standard.tsv"), []),
            nullcontext(),
            (
                ["header1"],
                {
                    "section1": polars.DataFrame(
                        {"col1": ["value1", "value2"], "col2": ["value3", "value4"]}
                    )
                },
            ),
        ),
        # one header and two sections
        (
            (path.join(test_data_dir, "parse_section_tsv/multiple_sections.tsv"), []),
            nullcontext(),
            (
                ["header1"],
                {
                    "section1": polars.DataFrame(
                        {"col1": ["value1", "value2"], "col2": ["value3", "value4"]}
                    ),
                    "section2": polars.DataFrame(
                        {"col1": ["value1", "value2"], "col2": ["value3", "value4"]}
                    ),
                },
            ),
        ),
        # no headers expected
        (
            (path.join(test_data_dir, "parse_section_tsv/no_headers.tsv"), []),
            nullcontext(),
            (
                [],
                {
                    "section1": polars.DataFrame(
                        {"col1": ["value1", "value2"], "col2": ["value3", "value4"]}
                    )
                },
            ),
        ),
        # additional empty lines between header and section
        (
            (path.join(test_data_dir, "parse_section_tsv/extra_empty_lines.tsv"), []),
            nullcontext(),
            (
                ["header1"],
                {
                    "section1": polars.DataFrame(
                        {"col1": ["value1", "value2"], "col2": ["value3", "value4"]}
                    )
                },
            ),
        ),
        # null columns must not show up in the result
        (
            (path.join(test_data_dir, "parse_section_tsv/null_columns.tsv"), []),
            nullcontext(),
            (
                ["header1"],
                {
                    "section1": polars.DataFrame(
                        {"col1": ["value1", "value2"], "col2": ["value3", "value4"]}
                    )
                },
            ),
        ),
        # a missing first column name is replaced by "-"
        (
            (
                path.join(
                    test_data_dir, "parse_section_tsv/empty_first_column_name.tsv"
                ),
                [],
            ),
            nullcontext(),
            (
                ["header1"],
                {
                    "section1": polars.DataFrame(
                        {"-": ["value1", "value2"], "col2": ["value3", "value4"]}
                    )
                },
            ),
        ),
        # key value section: keys become the column names
        (
            (path.join(test_data_dir, "parse_section_tsv/key_value.tsv"), ["section1"]),
            nullcontext(),
            (
                ["header1"],
                {
                    "section1": polars.DataFrame(
                        {"key1": ["value1"], "key2": ["value2"]}
                    )
                },
            ),
        ),
        # missing file
        (
            (path.join(test_data_dir, "parse_section_tsv/non-existent.tsv"), []),
            raises(FileNotFoundError),
            ([], {}),
        ),
        # empty file
        (
            (path.join(test_data_dir, "parse_section_tsv/empty.tsv"), []),
            raises(polars.exceptions.NoDataError),
            ([], {}),
        ),
    ],
)
def test_parse_section_tsv(inputs, exception, want):
    """Check headers and every expected section, or the expected exception."""
    tsv_path, key_value_sections = inputs
    want_headers, want_sections = want
    with exception:
        got_headers, got_sections = Parse_section_tsv(tsv_path, key_value_sections)
        assert got_headers == want_headers
        for name, expected in want_sections.items():
            assert name in got_sections
            assert got_sections[name].equals(expected)
|
|
||
|
|
||
@mark.parametrize(
    "df, want",
    [
        # section marker in the very first row
        (
            polars.DataFrame(
                {
                    "col1": ["[section1]", "col1", "value1"],
                    "col2": [None, "col2", "value2"],
                }
            ),
            [("section1", 1, 2)],
        ),
        # header row and empty row before the section
        (
            polars.DataFrame(
                {
                    "col1": ["header1", None, "[section1]", "col1", "value1"],
                    "col2": [None, None, None, "col2", "value2"],
                }
            ),
            [("section1", 3, 2)],
        ),
        # only empty rows before the section
        (
            polars.DataFrame(
                {
                    "col1": [None, None, "[section1]", "col1", "value1"],
                    "col2": [None, None, None, "col2", "value2"],
                }
            ),
            [("section1", 3, 2)],
        ),
        # first column name of the section is null
        (
            polars.DataFrame(
                {
                    "col1": [None, None, "[section1]", None, "value1"],
                    "col2": [None, None, None, "col2", "value2"],
                }
            ),
            [("section1", 3, 2)],
        ),
        # two sections separated by an empty row
        (
            polars.DataFrame(
                {
                    "col1": [
                        None,
                        None,
                        "[section1]",
                        "col1",
                        "value1",
                        None,
                        "[section2]",
                        "col1",
                        "value1",
                    ],
                    "col2": [
                        None,
                        None,
                        None,
                        "col2",
                        "value2",
                        None,
                        None,
                        "col2",
                        "value2",
                    ],
                }
            ),
            [("section1", 3, 2), ("section2", 7, 2)],
        ),
        # additional column that is completely null
        (
            polars.DataFrame(
                {
                    "col1": ["[section1]", "col1", "value1"],
                    "col2": [None, "col2", "value2"],
                    "col3": [None, None, None],
                }
            ),
            [("section1", 1, 2)],
        ),
    ],
)
def test_get_section_idx(df, want):
    """Check the (name, start, length) tuples reported for each layout."""
    # the parameter is named ``df`` rather than ``input`` to avoid shadowing the builtin
    got = _get_section_idx(df)
    assert got == want
|
|
||
|
|
||
@mark.parametrize(
    "inputs, want",
    [
        # headers spread over different columns are collected row by row
        (
            (
                polars.DataFrame(
                    {"col1": [None, "header2"], "col2": ["header1", None]}
                ),
                2,
            ),
            ["header1", "header2"],
        )
    ],
)
def test_parse_headers(inputs, want):
    """Check that non-null cells of the header rows are returned in order."""
    df, header_rows = inputs
    assert _parse_headers(df, header_rows) == want
|
|
||
|
|
||
@mark.parametrize(
    "df, want",
    [
        # a completely null column is removed
        (
            polars.DataFrame(
                {
                    "col1": ["col1", "value1"],
                    "col2": ["col2", "value2"],
                    "col3": [None, None],
                }
            ),
            polars.DataFrame({"col1": ["col1", "value1"], "col2": ["col2", "value2"]}),
        ),
        # a null in a kept column is replaced by "-"
        (
            polars.DataFrame(
                {
                    "col1": [None, "value1"],
                    "col2": ["col2", "value2"],
                    "col3": [None, None],
                }
            ),
            polars.DataFrame({"col1": ["-", "value1"], "col2": ["col2", "value2"]}),
        ),
    ],
)
def test_handle_row_with_nulls(df, want):
    """Check removal of null columns and filling of remaining nulls."""
    # the parameter is named ``df`` rather than ``input`` to avoid shadowing the builtin
    got = _handle_row_with_nulls(df)
    assert got.equals(want)
Empty file.
6 changes: 6 additions & 0 deletions
6
tests/test_data/general/parse_section_tsv/empty_first_column_name.tsv
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| header1 | ||
|
|
||
| [section1] | ||
| col2 | ||
| value1 value3 | ||
| value2 value4 |
8 changes: 8 additions & 0 deletions
8
tests/test_data/general/parse_section_tsv/extra_empty_lines.tsv
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| header1 | ||
|
|
||
|
|
||
|
|
||
| [section1] | ||
| col1 col2 | ||
| value1 value3 | ||
| value2 value4 |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.