diff --git a/skrub/__init__.py b/skrub/__init__.py
index 67143ec9a..f4564e106 100644
--- a/skrub/__init__.py
+++ b/skrub/__init__.py
@@ -41,6 +41,7 @@
 from ._similarity_encoder import SimilarityEncoder
 from ._squashing_scaler import SquashingScaler
 from ._string_encoder import StringEncoder
+from ._string_parser import StringParser
 from ._table_vectorizer import Cleaner, TableVectorizer
 from ._tabular_pipeline import tabular_learner, tabular_pipeline
 from ._text_encoder import TextEncoder
@@ -104,4 +105,5 @@
     "ApplyToCols",
     "ApplyToFrame",
     "ToFloat",
+    "StringParser",
 ]
diff --git a/skrub/_string_parser.py b/skrub/_string_parser.py
new file mode 100644
index 000000000..6083405cf
--- /dev/null
+++ b/skrub/_string_parser.py
@@ -0,0 +1,86 @@
+from . import _dataframe as sbd
+from ._on_each_column import SingleColumnTransformer
+
+# A number with an optional sign; "." or "," is accepted as decimal separator.
+_NUMBER = r"[-+]?\d*[.,]?\d+"
+_UNIT = r"[a-zA-Z]+"
+# Matches "<unit> <number>" or "<number> [<unit>]": the unit may only be
+# omitted when the number comes first.
+_VALUE_UNIT_PATTERN = (
+    rf"(?:(?P<unit1>{_UNIT})\s*(?P<value1>{_NUMBER}))"
+    rf"|(?:(?P<value2>{_NUMBER})\s*(?P<unit2>{_UNIT})?)"
+)
+
+
+class StringParser(SingleColumnTransformer):
+    """Parse strings holding a number and a unit into floats in a base unit.
+
+    Each entry is converted to a lowercase, stripped string and searched for
+    a number preceded or followed by a run of ASCII letters (the unit), e.g.
+    ``"5 kg"`` or ``"kg 5"``. The number is multiplied by the factor that
+    ``dictionary`` associates with the unit. Entries with a number but no
+    unit are treated as expressed in the base unit. Entries without a number,
+    or whose unit is not a key of ``dictionary``, become NaN.
+
+    Parameters
+    ----------
+    dictionary : dict
+        Maps unit names to the factor that converts a value in that unit to
+        the base unit. Keys must be lowercase because the input is lowercased
+        before matching. The base unit is the first key whose factor equals
+        1.0, or the first key when no factor equals 1.0.
+
+    Attributes
+    ----------
+    input_name_ : str
+        Name of the transformed column, or ``"string"`` when it has none.
+    all_outputs_ : list of str
+        Name of the output column: ``"<input name>_<base unit>"``.
+    """
+
+    def __init__(self, dictionary):
+        self.dictionary = dictionary
+
+    def fit_transform(self, X, y=None):
+        """Parse ``X`` by delegating to ``transform``; ``y`` is ignored."""
+        del y
+
+        return self.transform(X)
+
+    def transform(self, X):
+        """Return the values of ``X`` converted to floats in the base unit.
+
+        Raises
+        ------
+        ValueError
+            If ``dictionary`` is empty.
+        """
+        # NOTE(review): relies on the pandas Series API (``astype``, ``.str``,
+        # ``.index``); confirm how other column types should be handled.
+        if not self.dictionary:
+            raise ValueError("`dictionary` must contain at least one unit.")
+
+        col_name = sbd.name(X) or "string"
+        base_unit = next(
+            (unit for unit, factor in self.dictionary.items() if factor == 1.0),
+            next(iter(self.dictionary)),
+        )
+
+        X_str = X.astype(str).str.lower().str.strip()
+        parsed = X_str.str.extract(_VALUE_UNIT_PATTERN)
+
+        # Only one alternative of the pattern matches, so merge its groups.
+        unit = parsed["unit1"].combine_first(parsed["unit2"]).fillna(base_unit)
+        value = parsed["value1"].combine_first(parsed["value2"])
+        # Normalize a decimal comma before converting to float.
+        value = value.str.replace(",", ".", regex=False).astype(float)
+
+        result = value * unit.map(self.dictionary)
+        result.name = f"{col_name}_{base_unit}"
+        result.index = X.index
+
+        self.input_name_ = col_name
+        self.all_outputs_ = [result.name]
+        self._is_fitted = True
+
+        return result