From c3b6a30c3050ca2feedc0bc29e6c847f491e794e Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 12 Jun 2026 11:47:10 +0000
Subject: [PATCH 1/2] fix(file-based): support multi-sheet Excel workbooks via
 sheet_name config

Add an optional sheet_name field to ExcelFormat config that controls
which worksheet(s) the parser reads:
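- '<n>': the sheet at zero-based position n; '0' (the default) reads the
  first sheet only, which preserves existing behavior
- '<name>': a specific sheet by name (an all-digit value is always
  treated as a position, never as a name)
- '*': all sheets in the workbook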

Both the Calamine path and the OpenPyXL fallback path now pass sheet_name
through to pandas. Schema inference merges columns across the selected
sheets, and records from every selected sheet are emitted into the same
stream.

Co-Authored-By: bot_apk <apk@cognition.ai>
---
 .../sources/file_based/config/excel_format.py |   5 +
 .../file_based/file_types/excel_parser.py     |  78 +++++++++----
 .../file_types/test_excel_parser.py           | 105 +++++++++++++++++-
 3 files changed, 162 insertions(+), 26 deletions(-)
diff --git a/airbyte_cdk/sources/file_based/config/excel_format.py b/airbyte_cdk/sources/file_based/config/excel_format.py
index 632a0bc38..0dbc9c373 100644
--- a/airbyte_cdk/sources/file_based/config/excel_format.py
+++ b/airbyte_cdk/sources/file_based/config/excel_format.py
@@ -16,3 +16,8 @@ class Config(OneOfOptionConfig):
         "excel",
         const=True,
     )
+    sheet_name: str = Field(
+        default="0",
+        title="Sheet Name",
+        description='The Excel worksheet to read. Use a sheet name, a zero-indexed position like "0", or "*" to read all sheets.',
+    )
diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py
index 93896f14f..905b4dc0d 100644
--- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py
+++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py
@@ -33,6 +33,7 @@
 
 class ExcelParser(FileTypeParser):
     ENCODING = None
+    ALL_SHEETS = "*"
 
     def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
         """
@@ -62,18 +63,20 @@ async def infer_schema(
 
         # Validate the format of the config
         self.validate_format(config.format, logger)
+        excel_format = config.format
+        if not isinstance(excel_format, ExcelFormat):
+            raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)
 
         fields: Dict[str, str] = {}
 
         with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
-            df = self.open_and_parse_file(fp, logger, file)
-            for column, df_type in df.dtypes.items():
-                # Choose the broadest data type if the column's data type differs in dataframes
-                prev_frame_column_type = fields.get(column)  # type: ignore [call-overload]
-                fields[column] = self.dtype_to_json_type(  # type: ignore [index]
-                    prev_frame_column_type,
-                    df_type,
-                )
+            for df in self._parse_excel_file(fp, excel_format, logger, file).values():
+                for column, df_type in df.dtypes.items():
+                    prev_frame_column_type = fields.get(column)  # type: ignore [call-overload]
+                    fields[column] = self.dtype_to_json_type(  # type: ignore [index]
+                        prev_frame_column_type,
+                        df_type,
+                    )
 
         schema = {
             field: (
@@ -109,18 +112,19 @@ def parse_records(
 
         # Validate the format of the config
         self.validate_format(config.format, logger)
+        excel_format = config.format
+        if not isinstance(excel_format, ExcelFormat):
+            raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)
 
         try:
             # Open and parse the file using the stream reader
             with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
-                df = self.open_and_parse_file(fp, logger, file)
-                # Yield records as dictionaries
-                # DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson
-                # DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior
-                # see PR description: https://github.com/airbytehq/airbyte/pull/44444/
-                yield from orjson.loads(
-                    df.to_json(orient="records", date_format="iso", date_unit="us")
-                )
+                for df in self._parse_excel_file(fp, excel_format, logger, file).values():
+                    # DataFrame.to_dict() returns pandas.Timestamp values not serializable by orjson.
+                    # DataFrame.to_json() serializes datetimes to iso8601 with microseconds.
+                    yield from orjson.loads(
+                        df.to_json(orient="records", date_format="iso", date_unit="us")
+                    )
 
         except Exception as exc:
             # Raise a RecordParseError if any exception occurs during parsing
@@ -187,7 +191,8 @@ def _open_and_parse_file_with_calamine(
         fp: Union[IOBase, str, Path],
         logger: logging.Logger,
         file: RemoteFile,
-    ) -> pd.DataFrame:
+        sheet_name: Union[int, str, None] = 0,
+    ) -> Union[pd.DataFrame, Dict[Union[int, str], pd.DataFrame]]:
         """Opens and parses Excel file using Calamine engine.
 
         Args:
@@ -202,7 +207,7 @@ def _open_and_parse_file_with_calamine(
             ExcelCalamineParsingError: If Calamine fails to parse the file.
         """
         try:
-            return pd.ExcelFile(fp, engine="calamine").parse()  # type: ignore [arg-type, call-overload, no-any-return]
+            return pd.ExcelFile(fp, engine="calamine").parse(sheet_name=sheet_name)  # type: ignore [arg-type, call-overload, no-any-return]
         except BaseException as exc:
             # Calamine engine raises PanicException(child of BaseException) if Calamine fails to parse the file.
             # Checking if ValueError in exception arg to know if it was actually an error during parsing due to invalid values in cells.
@@ -222,7 +227,8 @@ def _open_and_parse_file_with_openpyxl(
         fp: Union[IOBase, str, Path],
         logger: logging.Logger,
         file: RemoteFile,
-    ) -> pd.DataFrame:
+        sheet_name: Union[int, str, None] = 0,
+    ) -> Union[pd.DataFrame, Dict[Union[int, str], pd.DataFrame]]:
         """Opens and parses Excel file using Openpyxl engine.
 
         Args:
@@ -245,19 +251,20 @@ def _open_and_parse_file_with_openpyxl(
 
         with warnings.catch_warnings(record=True) as warning_records:
             warnings.simplefilter("always")
-            df = pd.ExcelFile(fp, engine="openpyxl").parse()  # type: ignore [arg-type, call-overload]
+            dfs = pd.ExcelFile(fp, engine="openpyxl").parse(sheet_name=sheet_name)  # type: ignore [arg-type, call-overload]
 
         for warning in warning_records:
             logger.warning(f"Openpyxl warning for {file.file_uri_for_logging}: {warning.message}")
 
-        return df  # type: ignore [no-any-return]
+        return dfs  # type: ignore [no-any-return]
 
     def open_and_parse_file(
         self,
         fp: Union[IOBase, str, Path],
         logger: logging.Logger,
         file: RemoteFile,
-    ) -> pd.DataFrame:
+        sheet_name: Union[int, str, None] = 0,
+    ) -> Union[pd.DataFrame, Dict[Union[int, str], pd.DataFrame]]:
         """Opens and parses the Excel file with Calamine-first and Openpyxl fallback.
 
         Args:
@@ -269,6 +276,29 @@ def open_and_parse_file(
             pd.DataFrame: Parsed data from the Excel file.
         """
         try:
-            return self._open_and_parse_file_with_calamine(fp, logger, file)
+            return self._open_and_parse_file_with_calamine(fp, logger, file, sheet_name)
         except ExcelCalamineParsingError:
-            return self._open_and_parse_file_with_openpyxl(fp, logger, file)
+            return self._open_and_parse_file_with_openpyxl(fp, logger, file, sheet_name)
+
+    def _parse_excel_file(
+        self,
+        fp: Union[IOBase, str, Path],
+        excel_format: ExcelFormat,
+        logger: logging.Logger,
+        file: RemoteFile,
+    ) -> Dict[Union[int, str], pd.DataFrame]:
+        """Parses the configured sheet(s) of an Excel file and always returns a dict of sheet key → DataFrame."""
+        sheet_name = self._resolve_sheet_name(excel_format)
+        parsed = self.open_and_parse_file(fp, logger, file, sheet_name)
+        if isinstance(parsed, pd.DataFrame):  # single-sheet result: wrap it so callers can iterate a dict
+            return {excel_format.sheet_name: parsed}  # keyed by the raw config string (e.g. "0")
+        return parsed
+
+    def _resolve_sheet_name(self, excel_format: ExcelFormat) -> Union[int, str, None]:
+        """Converts the string config value to a pandas `sheet_name` argument: None, an int position, or a name."""
+        value = excel_format.sheet_name
+        if value == self.ALL_SHEETS:
+            return None  # pandas reads every sheet when sheet_name is None
+        if value.isdecimal():  # all-digit values are positions, so a sheet named e.g. "2024" cannot be picked by name
+            return int(value)
+        return value
diff --git a/unit_tests/sources/file_based/file_types/test_excel_parser.py b/unit_tests/sources/file_based/file_types/test_excel_parser.py
index 18850e9b0..2a13b5598 100644
--- a/unit_tests/sources/file_based/file_types/test_excel_parser.py
+++ b/unit_tests/sources/file_based/file_types/test_excel_parser.py
@@ -3,6 +3,7 @@
 #
 
 
+import asyncio
 import datetime
 import warnings
 from io import BytesIO
@@ -152,7 +153,7 @@ def test_open_and_parse_file_falls_back_to_openpyxl(mock_logger):
 
     calamine_excel_file = MagicMock()
 
-    def calamine_parse_side_effect():
+    def calamine_parse_side_effect(**kwargs):
         raise FakePanic(
             "failed to construct date: PyErr { type: <class 'ValueError'>, value: ValueError('year 20225 is out of range'), traceback: None }"
         )
@@ -161,7 +162,7 @@ def calamine_parse_side_effect():
 
     openpyxl_excel_file = MagicMock()
 
-    def openpyxl_parse_side_effect():
+    def openpyxl_parse_side_effect(**kwargs):
         warnings.warn("Cell A146 has invalid date", UserWarning)
         return fallback_df
 
@@ -238,3 +239,103 @@ def seek(self, *args, **kwargs):
     assert "Could not rewind stream" in msg
     assert remote_file.file_uri_for_logging in msg
     mock_excel.assert_called_once_with(fp, engine="openpyxl")
+    openpyxl_excel_file.parse.assert_called_once_with(sheet_name=0)
+
+
+def _make_multisheet_excel_bytes() -> bytes:
+    """Creates an in-memory workbook with sheets "First" (col_a, shared) and "Second" (col_b, shared)."""
+    buf = BytesIO()
+    with pd.ExcelWriter(buf, engine="xlsxwriter") as writer:
+        pd.DataFrame({"col_a": ["first"], "shared": [1]}).to_excel(
+            writer, index=False, sheet_name="First"
+        )
+        pd.DataFrame({"col_b": [2.5], "shared": [2]}).to_excel(
+            writer, index=False, sheet_name="Second"
+        )
+    return buf.getvalue()
+
+
+def _stream_reader_for(excel_bytes: bytes) -> MagicMock:
+    reader = MagicMock(spec=AbstractFileBasedStreamReader)
+    reader.open_file.return_value = BytesIO(excel_bytes)  # closed when the parser's `with` exits: one reader per parse
+    return reader
+
+
+@pytest.mark.parametrize(
+    "sheet_name,expected_records",
+    [
+        pytest.param(
+            "0",
+            [{"col_a": "first", "shared": 1}],
+            id="default_first_sheet",
+        ),
+        pytest.param(
+            "Second",
+            [{"col_b": 2.5, "shared": 2}],
+            id="sheet_by_name",
+        ),
+        pytest.param(
+            "*",
+            [{"col_a": "first", "shared": 1}, {"col_b": 2.5, "shared": 2}],  # per-sheet records; keys are not unified
+            id="all_sheets",
+        ),
+    ],
+)
+def test_parse_records_selects_configured_sheet(sheet_name, expected_records, remote_file):
+    parser = ExcelParser()
+    config = FileBasedStreamConfig(name="test_stream", format=ExcelFormat(sheet_name=sheet_name))
+    reader = _stream_reader_for(_make_multisheet_excel_bytes())
+
+    records = list(parser.parse_records(config, remote_file, reader, MagicMock()))  # list() drains the generator
+
+    assert records == expected_records
+
+
+@pytest.mark.parametrize(
+    "sheet_name,expected_schema",
+    [
+        pytest.param(
+            "0",
+            {"col_a": {"type": "string"}, "shared": {"type": "number"}},
+            id="first_sheet_schema",
+        ),
+        pytest.param(
+            "*",
+            {
+                "col_a": {"type": "string"},
+                "col_b": {"type": "number"},
+                "shared": {"type": "number"},  # appears on both sheets; merged into a single field
+            },
+            id="all_sheets_merged_schema",
+        ),
+    ],
+)
+def test_infer_schema_with_sheet_selection(sheet_name, expected_schema, remote_file):
+    parser = ExcelParser()
+    config = FileBasedStreamConfig(name="test_stream", format=ExcelFormat(sheet_name=sheet_name))
+    reader = _stream_reader_for(_make_multisheet_excel_bytes())
+
+    loop = asyncio.new_event_loop()  # infer_schema is a coroutine; drive it on a private loop
+    try:
+        schema = loop.run_until_complete(
+            parser.infer_schema(config, remote_file, reader, MagicMock())
+        )
+    finally:
+        loop.close()
+
+    assert schema == expected_schema
+
+
+@pytest.mark.parametrize(
+    "config_value,expected",
+    [
+        pytest.param("0", 0, id="zero_index"),  # all-digit strings become int positions
+        pytest.param("1", 1, id="numeric_index"),
+        pytest.param("MySheet", "MySheet", id="named_sheet"),  # anything else is passed through as a name
+        pytest.param("*", None, id="all_sheets"),  # None is pandas' "read every sheet" value
+    ],
+)
+def test_resolve_sheet_name(config_value, expected):
+    parser = ExcelParser()
+    fmt = ExcelFormat(sheet_name=config_value)
+    assert parser._resolve_sheet_name(fmt) == expected

From 57ca4e19a2632255fc3e66ced3b906ff2dfb2c03 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 12 Jun 2026 11:57:21 +0000
Subject: [PATCH 2/2] fix: update expected spec snapshot with sheet_name field

Co-Authored-By: bot_apk <apk@cognition.ai>
---
 unit_tests/sources/file_based/scenarios/csv_scenarios.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/unit_tests/sources/file_based/scenarios/csv_scenarios.py
index f16d83e20..730d0d727 100644
--- a/unit_tests/sources/file_based/scenarios/csv_scenarios.py
+++ b/unit_tests/sources/file_based/scenarios/csv_scenarios.py
@@ -472,7 +472,13 @@
                                                     "default": "excel",
                                                     "const": "excel",
                                                     "type": "string",
-                                                }
+                                                },
+                                                "sheet_name": {
+                                                    "title": "Sheet Name",
+                                                    "description": 'The Excel worksheet to read. Use a sheet name, a zero-indexed position like "0", or "*" to read all sheets.',
+                                                    "default": "0",
+                                                    "type": "string",
+                                                },
                                             },
                                             "required": ["filetype"],
                                         },