From 0a22b311003b336e0cbcdf4a6fa3ac9bb67fdd23 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eloi=20Massouli=C3=A9?= <eloi.massoulie@inria.fr>
Date: Thu, 4 Jun 2026 17:43:08 +0200
Subject: [PATCH 1/4] Created basic `_list_transformations` function

---
 skrub/_table_vectorizer.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index 2fea9bedf..05d034c41 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -181,6 +181,26 @@ def _get_preprocessors(
     return steps
 
 
+def _list_transformations(estimator):
+    for step in estimator._pipeline.named_steps:
+        if step == "checkinputdataframe":
+            continue
+        transformer = estimator._pipeline.named_steps[step]
+        # "transformer" is just ApplyToEachCol, so we need to get the actual transformer
+        match transformer.transformer:
+            case DropUninformative():
+                print("DropUninformative - ")
+                dropped = set(transformer.all_inputs_) - set(transformer.all_outputs_)
+                print(f"Dropped columns {dropped} - ")
+                print(f"Used inputs: {transformer.used_inputs_} - ")
+            case ToFloat():
+                print("ToFloat - ")
+                print(f"Columns transformed to float: {transformer.used_inputs_} - ")
+            case ToDatetime():
+                print("ToDatetime - ")
+                print(f"Columns transformed to datetime: {transformer.used_inputs_} - ")
+
+
 class Cleaner(TransformerMixin, BaseEstimator):
     """Column-wise consistency checks and sanitization of dtypes, null values and dates.
 

From e93c7ec879f45068fb6845126a0b3ee78a50b0c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eloi=20Massouli=C3=A9?= <eloi.massoulie@inria.fr>
Date: Wed, 10 Jun 2026 14:25:21 +0200
Subject: [PATCH 2/4] Rough version of `list_transformations`

---
 skrub/_table_vectorizer.py | 91 ++++++++++++++++++++++++++++++++++----
 1 file changed, 83 insertions(+), 8 deletions(-)

diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index 05d034c41..82746d193 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -182,23 +182,37 @@ def _get_preprocessors(
 
 
 def _list_transformations(estimator):
+    message = ""
     for step in estimator._pipeline.named_steps:
         if step == "checkinputdataframe":
             continue
         transformer = estimator._pipeline.named_steps[step]
-        # "transformer" is just ApplyToEachCol, so we need to get the actual transformer
         match transformer.transformer:
             case DropUninformative():
-                print("DropUninformative - ")
                 dropped = set(transformer.all_inputs_) - set(transformer.all_outputs_)
-                print(f"Dropped columns {dropped} - ")
-                print(f"Used inputs: {transformer.used_inputs_} - ")
+                if dropped != set():
+                    message += "DropUninformative - " + "\n"
+                    message += f"Dropped columns {dropped}" + "\n"
+                    message += f"Used inputs: {transformer.used_inputs_} - " + "\n"
             case ToFloat():
-                print("ToFloat - ")
-                print(f"Columns transformed to float: {transformer.used_inputs_} - ")
+                message += "ToFloat - " + "\n"
+                message += (
+                    f"Columns transformed to float: {transformer.used_inputs_} - "
+                    + "\n"
+                )
             case ToDatetime():
-                print("ToDatetime - ")
-                print(f"Columns transformed to datetime: {transformer.used_inputs_} - ")
+                message += "ToDatetime - " + "\n"
+                message += (
+                    f"Columns transformed to datetime: {transformer.used_inputs_} - "
+                    + "\n"
+                )
+            case CleanNullStrings():
+                message += "CleanNullStrings - " + "\n"
+                message += (
+                    f"Columns with standardized nulls: {transformer.used_inputs_} - "
+                    + "\n"
+                )
+    return message
 
 
 class Cleaner(TransformerMixin, BaseEstimator):
@@ -558,6 +572,9 @@ def get_feature_names_out(self, input_features=None):
         check_is_fitted(self, "all_outputs_")
         return np.asarray(self.all_outputs_)
 
+    def list_transformations(self):
+        print(_list_transformations(self))
+
 
 class TableVectorizer(TransformerMixin, BaseEstimator):
     """Transform a dataframe to a numeric (vectorized) representation.
@@ -1184,3 +1201,61 @@ def get_feature_names_out(self, input_features=None):
         """
         check_is_fitted(self, "all_outputs_")
         return np.asarray(self.all_outputs_)
+
+    def list_transformations(self):
+        preprocessing_transformations = _list_transformations(self)
+        vectorize_transformations = ""
+        specific_transformations = "Specific transformations: " + "\n"
+
+        for step in self._pipeline.named_steps:
+            if step == "checkinputdataframe":
+                continue
+            transformer = self._pipeline.named_steps[step]
+            match transformer.transformer:
+                case type(self.numeric):
+                    vectorize_transformations += (
+                        f"Numeric transformer: {self.numeric}" + "\n"
+                    )
+                    vectorize_transformations += (
+                        f"Numerical columns transformed: {transformer.used_inputs}"
+                        + "\n"
+                    )
+                case type(self.datetime):
+                    vectorize_transformations += (
+                        f"Datetime transformer: {self.datetime}" + "\n"
+                    )
+                    vectorize_transformations += (
+                        f"Datetime columns transformed: {transformer.used_inputs}"
+                        + "\n"
+                    )
+                case type(self.low_cardinality):
+                    vectorize_transformations += (
+                        f"Low-cardinality transformer: {self.low_cardinality}" + "\n"
+                    )
+                    vectorize_transformations += (
+                        f"Low-cardinality columns transformed: \
+                            {transformer.used_inputs}"
+                        + "\n"
+                    )
+                case type(self.high_cardinality):
+                    vectorize_transformations += (
+                        f"High-cardinality transformer: {self.high_cardinality}" + "\n"
+                    )
+                    vectorize_transformations += (
+                        f"High-cardinality columns transformed: \
+                            {self.high_cardinality.used_inputs}"
+                        + "\n"
+                    )
+            if transformer.transformer in self.specific_transformers:
+                specific_transformations += (
+                    f"{transformer.transformer} applied to: {transformer.used_inputs}"
+                    + "\n"
+                )
+
+        return (
+            preprocessing_transformations
+            + "\n\n"
+            + vectorize_transformations
+            + "\n\n"
+            + specific_transformations
+        )

From 36a6e712925fdec6e80a3bf1b72db259b1585b42 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eloi=20Massouli=C3=A9?= <eloi.massoulie@inria.fr>
Date: Mon, 15 Jun 2026 16:18:24 +0200
Subject: [PATCH 3/4] Removed pattern matching for custom transformers

---
 skrub/_table_vectorizer.py | 74 +++++++++++++++++++-------------------
 1 file changed, 36 insertions(+), 38 deletions(-)

diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index 82746d193..c89166b70 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -1210,48 +1210,46 @@ def list_transformations(self):
         for step in self._pipeline.named_steps:
             if step == "checkinputdataframe":
                 continue
+
             transformer = self._pipeline.named_steps[step]
-            match transformer.transformer:
-                case type(self.numeric):
-                    vectorize_transformations += (
-                        f"Numeric transformer: {self.numeric}" + "\n"
-                    )
-                    vectorize_transformations += (
-                        f"Numerical columns transformed: {transformer.used_inputs}"
-                        + "\n"
-                    )
-                case type(self.datetime):
-                    vectorize_transformations += (
-                        f"Datetime transformer: {self.datetime}" + "\n"
-                    )
-                    vectorize_transformations += (
-                        f"Datetime columns transformed: {transformer.used_inputs}"
-                        + "\n"
-                    )
-                case type(self.low_cardinality):
-                    vectorize_transformations += (
-                        f"Low-cardinality transformer: {self.low_cardinality}" + "\n"
-                    )
-                    vectorize_transformations += (
-                        f"Low-cardinality columns transformed: \
-                            {transformer.used_inputs}"
-                        + "\n"
-                    )
-                case type(self.high_cardinality):
-                    vectorize_transformations += (
-                        f"High-cardinality transformer: {self.high_cardinality}" + "\n"
-                    )
-                    vectorize_transformations += (
-                        f"High-cardinality columns transformed: \
-                            {self.high_cardinality.used_inputs}"
-                        + "\n"
-                    )
+            trans_type = type(transformer.transformer)
+
+            if trans_type == type(self.numeric):
+                vectorize_transformations += (
+                    f"Numeric transformer: {self.numeric}" + "\n"
+                )
+                vectorize_transformations += (
+                    f"applied to numerical columns {transformer.used_inputs_}" + "\n"
+                )
+            elif trans_type == type(self.datetime):
+                vectorize_transformations += (
+                    f"Datetime transformer: {self.datetime}" + "\n"
+                )
+                vectorize_transformations += (
+                    f"applied to datetime columns {transformer.used_inputs_}" + "\n"
+                )
+            elif trans_type == type(self.low_cardinality):
+                vectorize_transformations += (
+                    f"Low-cardinality transformer: {self.low_cardinality}" + "\n"
+                )
+                vectorize_transformations += (
+                    f"applied to low-cardinality columns \
+                        {transformer.used_inputs_}"
+                    + "\n"
+                )
+            elif trans_type == type(self.high_cardinality):
+                vectorize_transformations += (
+                    f"High-cardinality transformer: {self.high_cardinality}" + "\n"
+                )
+                vectorize_transformations += (
+                    f"applied to high-cardinality columns \
+                        {transformer.used_inputs_}"
+                    + "\n"
+                )
             if transformer.transformer in self.specific_transformers:
                 specific_transformations += (
-                    f"{transformer.transformer} applied to: {transformer.used_inputs}"
-                    + "\n"
+                    f"{transformer} applied to: {transformer.used_inputs_}" + "\n"
                 )
-
         return (
             preprocessing_transformations
             + "\n\n"

From 1270c712caa09570d0a4647f975d2e30debebe0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eloi=20Massouli=C3=A9?= <eloi.massoulie@inria.fr>
Date: Fri, 19 Jun 2026 15:59:49 +0200
Subject: [PATCH 4/4] Introduced basic test and simplified transformer list

---
 skrub/_table_vectorizer.py           | 66 ++++++++++------------------
 skrub/tests/test_table_vectorizer.py | 13 ++++++
 2 files changed, 36 insertions(+), 43 deletions(-)

diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index c89166b70..e83172b46 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -193,23 +193,22 @@ def _list_transformations(estimator):
                 if dropped != set():
                     message += "DropUninformative - " + "\n"
                     message += f"Dropped columns {dropped}" + "\n"
-                    message += f"Used inputs: {transformer.used_inputs_} - " + "\n"
+                    message += f"Used inputs: {transformer.used_inputs_}" + "\n"
             case ToFloat():
                 message += "ToFloat - " + "\n"
                 message += (
-                    f"Columns transformed to float: {transformer.used_inputs_} - "
-                    + "\n"
+                    f"Columns transformed to float: {transformer.used_inputs_}" + "\n"
                 )
             case ToDatetime():
                 message += "ToDatetime - " + "\n"
                 message += (
-                    f"Columns transformed to datetime: {transformer.used_inputs_} - "
+                    f"Columns transformed to datetime: {transformer.used_inputs_}"
                     + "\n"
                 )
             case CleanNullStrings():
                 message += "CleanNullStrings - " + "\n"
                 message += (
-                    f"Columns with standardized nulls: {transformer.used_inputs_} - "
+                    f"Columns with standardized nulls: {transformer.used_inputs_}"
                     + "\n"
                 )
     return message
@@ -573,7 +572,7 @@ def get_feature_names_out(self, input_features=None):
         return np.asarray(self.all_outputs_)
 
     def list_transformations(self):
-        print(_list_transformations(self))
+        return _list_transformations(self)
 
 
 class TableVectorizer(TransformerMixin, BaseEstimator):
@@ -1205,51 +1204,32 @@ def get_feature_names_out(self, input_features=None):
     def list_transformations(self):
         preprocessing_transformations = _list_transformations(self)
         vectorize_transformations = ""
-        specific_transformations = "Specific transformations: " + "\n"
-
-        for step in self._pipeline.named_steps:
-            if step == "checkinputdataframe":
-                continue
+        specific_transformations = ""
 
-            transformer = self._pipeline.named_steps[step]
-            trans_type = type(transformer.transformer)
+        all_transformers = self.kind_to_columns_
+        specific = all_transformers.pop("specific")
 
-            if trans_type == type(self.numeric):
-                vectorize_transformations += (
-                    f"Numeric transformer: {self.numeric}" + "\n"
-                )
-                vectorize_transformations += (
-                    f"applied to numerical columns {transformer.used_inputs_}" + "\n"
-                )
-            elif trans_type == type(self.datetime):
-                vectorize_transformations += (
-                    f"Datetime transformer: {self.datetime}" + "\n"
-                )
-                vectorize_transformations += (
-                    f"applied to datetime columns {transformer.used_inputs_}" + "\n"
-                )
-            elif trans_type == type(self.low_cardinality):
-                vectorize_transformations += (
-                    f"Low-cardinality transformer: {self.low_cardinality}" + "\n"
-                )
+        for transformer_type, transformer_cols in all_transformers.items():
+            if transformer_cols != []:
                 vectorize_transformations += (
-                    f"applied to low-cardinality columns \
-                        {transformer.used_inputs_}"
+                    f"{transformer_type} transformer is \
+                        {getattr(self, transformer_type)} \
+                            and was applied to {transformer_cols}."
                     + "\n"
                 )
-            elif trans_type == type(self.high_cardinality):
-                vectorize_transformations += (
-                    f"High-cardinality transformer: {self.high_cardinality}" + "\n"
-                )
+            else:
                 vectorize_transformations += (
-                    f"applied to high-cardinality columns \
-                        {transformer.used_inputs_}"
+                    f"{transformer_type} transformer is \
+                        {getattr(self, transformer_type)} \
+                            and was applied to nothing."
                     + "\n"
                 )
-            if transformer.transformer in self.specific_transformers:
-                specific_transformations += (
-                    f"{transformer} applied to: {transformer.used_inputs_}" + "\n"
-                )
+
+        if self.specific_transformers != ():
+            for t in self.specific_transformers:
+                specific_transformations += f"specific transformer \
+                        {t} was applied to {specific}"
+
         return (
             preprocessing_transformations
             + "\n\n"
diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py
index 43d438071..6a41d67e6 100644
--- a/skrub/tests/test_table_vectorizer.py
+++ b/skrub/tests/test_table_vectorizer.py
@@ -28,6 +28,7 @@
 from skrub._to_float import ToFloat
 from skrub._to_str import ToStr
 from skrub.conftest import _POLARS_INSTALLED
+from skrub.datasets._generating import toy_cities
 
 MSG_PANDAS_DEPRECATED_WARNING = "Skip deprecation warning"
 
@@ -1277,3 +1278,15 @@ def test_duration_to_float(df_module):
     vectorizer = Cleaner()
     transformed = vectorizer.fit_transform(df)
     df_module.assert_column_equal(transformed["duration"], df["duration"])
+
+
+def test_list_transformations(df_module):
+    df = toy_cities()
+
+    vectorizer = TableVectorizer()
+    _ = vectorizer.fit_transform(df)
+    _ = vectorizer.list_transformations()
+
+    vectorizer = Cleaner()
+    _ = vectorizer.fit_transform(df)
+    _ = vectorizer.list_transformations()