From 0a22b311003b336e0cbcdf4a6fa3ac9bb67fdd23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eloi=20Massouli=C3=A9?= Date: Thu, 4 Jun 2026 17:43:08 +0200 Subject: [PATCH 1/4] Created basic `_list_transformations` function --- skrub/_table_vectorizer.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 2fea9bedf..05d034c41 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -181,6 +181,26 @@ def _get_preprocessors( return steps +def _list_transformations(estimator): + for step in estimator._pipeline.named_steps: + if step == "checkinputdataframe": + continue + transformer = estimator._pipeline.named_steps[step] + # "transformer" is just ApplyToEachCol, so we need to get the actual transformer + match transformer.transformer: + case DropUninformative(): + print("DropUninformative - ") + dropped = set(transformer.all_inputs_) - set(transformer.all_outputs_) + print(f"Dropped columns {dropped} - ") + print(f"Used inputs: {transformer.used_inputs_} - ") + case ToFloat(): + print("ToFloat - ") + print(f"Columns transformed to float: {transformer.used_inputs_} - ") + case ToDatetime(): + print("ToDatetime - ") + print(f"Columns transformed to datetime: {transformer.used_inputs_} - ") + + class Cleaner(TransformerMixin, BaseEstimator): """Column-wise consistency checks and sanitization of dtypes, null values and dates. 
From e93c7ec879f45068fb6845126a0b3ee78a50b0c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eloi=20Massouli=C3=A9?= Date: Wed, 10 Jun 2026 14:25:21 +0200 Subject: [PATCH 2/4] Rough version of `list_transformations` --- skrub/_table_vectorizer.py | 91 ++++++++++++++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 8 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 05d034c41..82746d193 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -182,23 +182,37 @@ def _get_preprocessors( def _list_transformations(estimator): + message = "" for step in estimator._pipeline.named_steps: if step == "checkinputdataframe": continue transformer = estimator._pipeline.named_steps[step] - # "transformer" is just ApplyToEachCol, so we need to get the actual transformer match transformer.transformer: case DropUninformative(): - print("DropUninformative - ") dropped = set(transformer.all_inputs_) - set(transformer.all_outputs_) - print(f"Dropped columns {dropped} - ") - print(f"Used inputs: {transformer.used_inputs_} - ") + if dropped != set(): + message += "DropUninformative - " + "\n" + message += f"Dropped columns {dropped}" + "\n" + message += f"Used inputs: {transformer.used_inputs_} - " + "\n" case ToFloat(): - print("ToFloat - ") - print(f"Columns transformed to float: {transformer.used_inputs_} - ") + message += "ToFloat - " + "\n" + message += ( + f"Columns transformed to float: {transformer.used_inputs_} - " + + "\n" + ) case ToDatetime(): - print("ToDatetime - ") - print(f"Columns transformed to datetime: {transformer.used_inputs_} - ") + message += "ToDatetime - " + "\n" + message += ( + f"Columns transformed to datetime: {transformer.used_inputs_} - " + + "\n" + ) + case CleanNullStrings(): + message += "CleanNullStrings - " + "\n" + message += ( + f"Columns with standardized nulls: {transformer.used_inputs_} - " + + "\n" + ) + return message class Cleaner(TransformerMixin, BaseEstimator): @@ -558,6 +572,9 @@ 
def get_feature_names_out(self, input_features=None): check_is_fitted(self, "all_outputs_") return np.asarray(self.all_outputs_) + def list_transformations(self): + print(_list_transformations(self)) + class TableVectorizer(TransformerMixin, BaseEstimator): """Transform a dataframe to a numeric (vectorized) representation. @@ -1184,3 +1201,61 @@ def get_feature_names_out(self, input_features=None): """ check_is_fitted(self, "all_outputs_") return np.asarray(self.all_outputs_) + + def list_transformations(self): + preprocessing_transformations = _list_transformations(self) + vectorize_transformations = "" + specific_transformations = "Specific transformations: " + "\n" + + for step in self._pipeline.named_steps: + if step == "checkinputdataframe": + continue + transformer = self._pipeline.named_steps[step] + match transformer.transformer: + case type(self.numeric): + vectorize_transformations += ( + f"Numeric transformer: {self.numeric}" + "\n" + ) + vectorize_transformations += ( + f"Numerical columns transformed: {transformer.used_inputs}" + + "\n" + ) + case type(self.datetime): + vectorize_transformations += ( + f"Datetime transformer: {self.datetime}" + "\n" + ) + vectorize_transformations += ( + f"Datetime columns transformed: {transformer.used_inputs}" + + "\n" + ) + case type(self.low_cardinality): + vectorize_transformations += ( + f"Low-cardinality transformer: {self.low_cardinality}" + "\n" + ) + vectorize_transformations += ( + f"Low-cardinality columns transformed: \ + {transformer.used_inputs}" + + "\n" + ) + case type(self.high_cardinality): + vectorize_transformations += ( + f"High-cardinality transformer: {self.high_cardinality}" + "\n" + ) + vectorize_transformations += ( + f"High-cardinality columns transformed: \ + {self.high_cardinality.used_inputs}" + + "\n" + ) + if transformer.transformer in self.specific_transformers: + specific_transformations += ( + f"{transformer.transformer} applied to: {transformer.used_inputs}" + + "\n" + ) + + return 
( + preprocessing_transformations + + "\n\n" + + vectorize_transformations + + "\n\n" + + specific_transformations + ) From 36a6e712925fdec6e80a3bf1b72db259b1585b42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eloi=20Massouli=C3=A9?= Date: Mon, 15 Jun 2026 16:18:24 +0200 Subject: [PATCH 3/4] Removed pattern matching for custom transformers --- skrub/_table_vectorizer.py | 74 +++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 38 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 82746d193..c89166b70 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -1210,48 +1210,46 @@ def list_transformations(self): for step in self._pipeline.named_steps: if step == "checkinputdataframe": continue + transformer = self._pipeline.named_steps[step] - match transformer.transformer: - case type(self.numeric): - vectorize_transformations += ( - f"Numeric transformer: {self.numeric}" + "\n" - ) - vectorize_transformations += ( - f"Numerical columns transformed: {transformer.used_inputs}" - + "\n" - ) - case type(self.datetime): - vectorize_transformations += ( - f"Datetime transformer: {self.datetime}" + "\n" - ) - vectorize_transformations += ( - f"Datetime columns transformed: {transformer.used_inputs}" - + "\n" - ) - case type(self.low_cardinality): - vectorize_transformations += ( - f"Low-cardinality transformer: {self.low_cardinality}" + "\n" - ) - vectorize_transformations += ( - f"Low-cardinality columns transformed: \ - {transformer.used_inputs}" - + "\n" - ) - case type(self.high_cardinality): - vectorize_transformations += ( - f"High-cardinality transformer: {self.high_cardinality}" + "\n" - ) - vectorize_transformations += ( - f"High-cardinality columns transformed: \ - {self.high_cardinality.used_inputs}" - + "\n" - ) + trans_type = type(transformer.transformer) + + if trans_type == type(self.numeric): + vectorize_transformations += ( + f"Numeric transformer: {self.numeric}" + "\n" + ) + 
vectorize_transformations += ( + f"applied to numerical columns {transformer.used_inputs_}" + "\n" + ) + elif trans_type == type(self.datetime): + vectorize_transformations += ( + f"Datetime transformer: {self.datetime}" + "\n" + ) + vectorize_transformations += ( + f"applied to datetime columns {transformer.used_inputs_}" + "\n" + ) + elif trans_type == type(self.low_cardinality): + vectorize_transformations += ( + f"Low-cardinality transformer: {self.low_cardinality}" + "\n" + ) + vectorize_transformations += ( + f"applied to low-cardinality columns \ + {transformer.used_inputs_}" + + "\n" + ) + elif trans_type == type(self.high_cardinality): + vectorize_transformations += ( + f"High-cardinality transformer: {self.high_cardinality}" + "\n" + ) + vectorize_transformations += ( + f"applied to high-cardinality columns \ + {transformer.used_inputs_}" + + "\n" + ) if transformer.transformer in self.specific_transformers: specific_transformations += ( - f"{transformer.transformer} applied to: {transformer.used_inputs}" - + "\n" + f"{transformer} applied to: {transformer.used_inputs_}" + "\n" ) - return ( preprocessing_transformations + "\n\n" From 1270c712caa09570d0a4647f975d2e30debebe0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eloi=20Massouli=C3=A9?= Date: Fri, 19 Jun 2026 15:59:49 +0200 Subject: [PATCH 4/4] Introduced basic test and simplified transformer list --- skrub/_table_vectorizer.py | 66 ++++++++++------------------ skrub/tests/test_table_vectorizer.py | 13 ++++++ 2 files changed, 36 insertions(+), 43 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index c89166b70..e83172b46 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -193,23 +193,22 @@ def _list_transformations(estimator): if dropped != set(): message += "DropUninformative - " + "\n" message += f"Dropped columns {dropped}" + "\n" - message += f"Used inputs: {transformer.used_inputs_} - " + "\n" + message += f"Used inputs: 
{transformer.used_inputs_}" + "\n" case ToFloat(): message += "ToFloat - " + "\n" message += ( - f"Columns transformed to float: {transformer.used_inputs_} - " - + "\n" + f"Columns transformed to float: {transformer.used_inputs_}" + "\n" ) case ToDatetime(): message += "ToDatetime - " + "\n" message += ( - f"Columns transformed to datetime: {transformer.used_inputs_} - " + f"Columns transformed to datetime: {transformer.used_inputs_}" + "\n" ) case CleanNullStrings(): message += "CleanNullStrings - " + "\n" message += ( - f"Columns with standardized nulls: {transformer.used_inputs_} - " + f"Columns with standardized nulls: {transformer.used_inputs_}" + "\n" ) return message @@ -573,7 +572,7 @@ def get_feature_names_out(self, input_features=None): return np.asarray(self.all_outputs_) def list_transformations(self): - print(_list_transformations(self)) + return _list_transformations(self) class TableVectorizer(TransformerMixin, BaseEstimator): @@ -1205,51 +1204,32 @@ def get_feature_names_out(self, input_features=None): def list_transformations(self): preprocessing_transformations = _list_transformations(self) vectorize_transformations = "" - specific_transformations = "Specific transformations: " + "\n" - - for step in self._pipeline.named_steps: - if step == "checkinputdataframe": - continue + specific_transformations = "" - transformer = self._pipeline.named_steps[step] - trans_type = type(transformer.transformer) + all_transformers = self.kind_to_columns_ + specific = all_transformers.pop("specific") - if trans_type == type(self.numeric): - vectorize_transformations += ( - f"Numeric transformer: {self.numeric}" + "\n" - ) - vectorize_transformations += ( - f"applied to numerical columns {transformer.used_inputs_}" + "\n" - ) - elif trans_type == type(self.datetime): - vectorize_transformations += ( - f"Datetime transformer: {self.datetime}" + "\n" - ) - vectorize_transformations += ( - f"applied to datetime columns {transformer.used_inputs_}" + "\n" - ) - elif 
trans_type == type(self.low_cardinality): - vectorize_transformations += ( - f"Low-cardinality transformer: {self.low_cardinality}" + "\n" - ) + for transformer_type, transformer_cols in all_transformers.items(): + if transformer_cols != []: vectorize_transformations += ( - f"applied to low-cardinality columns \ - {transformer.used_inputs_}" + f"{transformer_type} transformer is \ + {getattr(self, transformer_type)} \ + and was applied to {transformer_cols}." + "\n" ) - elif trans_type == type(self.high_cardinality): - vectorize_transformations += ( - f"High-cardinality transformer: {self.high_cardinality}" + "\n" - ) + else: vectorize_transformations += ( - f"applied to high-cardinality columns \ - {transformer.used_inputs_}" + f"{transformer_type} transformer is \ + {getattr(self, transformer_type)} \ + and was applied to nothing." + "\n" ) - if transformer.transformer in self.specific_transformers: - specific_transformations += ( - f"{transformer} applied to: {transformer.used_inputs_}" + "\n" - ) + + if self.specific_transformers != (): + for t in self.specific_transformers: + specific_transformations += f"specific transformer \ + {t} was applied to {specific}" + return ( preprocessing_transformations + "\n\n" diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 43d438071..6a41d67e6 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -28,6 +28,7 @@ from skrub._to_float import ToFloat from skrub._to_str import ToStr from skrub.conftest import _POLARS_INSTALLED +from skrub.datasets._generating import toy_cities MSG_PANDAS_DEPRECATED_WARNING = "Skip deprecation warning" @@ -1277,3 +1278,15 @@ def test_duration_to_float(df_module): vectorizer = Cleaner() transformed = vectorizer.fit_transform(df) df_module.assert_column_equal(transformed["duration"], df["duration"]) + + +def test_list_transformations(df_module): + df = toy_cities() + + vectorizer = TableVectorizer() + _ = 
vectorizer.fit_transform(df) + _ = vectorizer.list_transformations() + + vectorizer = Cleaner() + _ = vectorizer.fit_transform(df) + _ = vectorizer.list_transformations()