Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions skrub/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from ._similarity_encoder import SimilarityEncoder
from ._squashing_scaler import SquashingScaler
from ._string_encoder import StringEncoder
from ._string_parser import StringParser
from ._table_vectorizer import Cleaner, TableVectorizer
from ._tabular_pipeline import tabular_learner, tabular_pipeline
from ._text_encoder import TextEncoder
Expand Down Expand Up @@ -104,4 +105,5 @@
"ApplyToCols",
"ApplyToFrame",
"ToFloat",
"StringParser",
]
51 changes: 51 additions & 0 deletions skrub/_string_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from . import _dataframe as sbd
from ._on_each_column import SingleColumnTransformer


class StringParser(SingleColumnTransformer):
    """Parse strings holding a number and a unit into floats in a base unit.

    Each entry of the column is converted to ``str``, lower-cased and
    stripped, and commas are replaced by dots so that ``"1,5 kg"`` is read
    as ``1.5``. The first occurrence of a number adjacent to an alphabetic
    unit is then extracted, in either order (``"5 kg"`` or ``"kg 5"``), and
    the number is multiplied by the factor registered for that unit in
    ``dictionary``.

    Parameters
    ----------
    dictionary : dict
        Maps unit names to multiplicative conversion factors. Because the
        input is lower-cased before parsing, only lower-case keys can match.
        The base unit is the first key whose factor equals ``1.0``; when no
        factor equals ``1.0`` the first key of the dictionary is used.

    Attributes
    ----------
    input_name_ : str
        Name of the last transformed column (``"string"`` when the column
        has no name).
    all_outputs_ : list of str
        Single-element list holding the name of the output column,
        ``"<input name>_<base unit>"``.

    Notes
    -----
    Entries in which no number/unit pair is found, or whose unit is not a
    key of ``dictionary``, produce a missing value in the output.

    An empty ``dictionary`` raises ``IndexError`` when the base unit is
    looked up.

    NOTE(review): the implementation uses the pandas ``Series.str``
    accessor directly; behaviour with other dataframe backends is not
    established here -- TODO confirm.
    """

    # Two alternatives: "<unit> <value>" or "<value> <unit>". Each one is a
    # closed non-capturing group so the alternation applies to whole pairs.
    _PATTERN = (
        r"(?:(?P<unit1>[a-zA-Z]+)\s*(?P<value1>[-+]?\d*\.?\d+))"
        r"|(?:(?P<value2>[-+]?\d*\.?\d+)\s*(?P<unit2>[a-zA-Z]+))"
    )

    def __init__(self, dictionary):
        self.dictionary = dictionary

    def fit_transform(self, X, y=None):
        """Parse ``X``; ``y`` is ignored. Equivalent to ``transform(X)``."""
        del y

        return self.transform(X)

    def transform(self, X):
        """Convert a column of "<number> <unit>" strings to base-unit floats.

        Parameters
        ----------
        X : column
            Column to parse. Its entries are cast to ``str`` first.

        Returns
        -------
        column of float
            ``value * dictionary[unit]`` for every entry, named
            ``"<input name>_<base unit>"`` and carrying the index of ``X``.
        """
        col_name = sbd.name(X) or "string"

        # Normalise decimal commas *before* extraction: the value pattern
        # only understands ".", so replacing afterwards would be too late.
        X_str = (
            X.astype(str)
            .str.lower()
            .str.strip()
            .str.replace(",", ".", regex=False)
        )

        parsed = X_str.str.extract(self._PATTERN)
        # Only one of the two alternatives can match, so merge their groups.
        unit = parsed["unit1"].combine_first(parsed["unit2"])
        value = parsed["value1"].combine_first(parsed["value2"]).astype(float)

        base_unit = next(
            (u for u, f in self.dictionary.items() if f == 1.0),
            list(self.dictionary.keys())[0],
        )

        unit = unit.fillna(base_unit)

        # Units absent from the dictionary map to NaN, hence a NaN result.
        factor = unit.map(self.dictionary)
        result = value * factor
        result.name = f"{col_name}_{base_unit}"
        result.index = X.index

        self.input_name_ = col_name
        self.all_outputs_ = [result.name]
        self._is_fitted = True

        return result

Loading