From c3b6a30c3050ca2feedc0bc29e6c847f491e794e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 12 Jun 2026 11:47:10 +0000 Subject: [PATCH 1/2] fix(file-based): support multi-sheet Excel workbooks via sheet_name config Add an optional sheet_name field to ExcelFormat config that controls which worksheet(s) the parser reads: - '0' (default): first sheet only (preserves existing behavior) - '<name>': a specific sheet by name - '*': all sheets in the workbook Both Calamine and OpenPyXL fallback paths now pass sheet_name through to pandas. Schema inference merges columns across selected sheets. Co-Authored-By: bot_apk --- .../sources/file_based/config/excel_format.py | 5 + .../file_based/file_types/excel_parser.py | 78 +++++++++---- .../file_types/test_excel_parser.py | 105 +++++++++++++++++- 3 files changed, 162 insertions(+), 26 deletions(-) diff --git a/airbyte_cdk/sources/file_based/config/excel_format.py b/airbyte_cdk/sources/file_based/config/excel_format.py index 632a0bc38..0dbc9c373 100644 --- a/airbyte_cdk/sources/file_based/config/excel_format.py +++ b/airbyte_cdk/sources/file_based/config/excel_format.py @@ -16,3 +16,8 @@ class Config(OneOfOptionConfig): "excel", const=True, ) + sheet_name: str = Field( + default="0", + title="Sheet Name", + description='The Excel worksheet to read. 
Use a sheet name, a zero-indexed position like "0", or "*" to read all sheets.', + ) diff --git a/airbyte_cdk/sources/file_based/file_types/excel_parser.py b/airbyte_cdk/sources/file_based/file_types/excel_parser.py index 93896f14f..905b4dc0d 100644 --- a/airbyte_cdk/sources/file_based/file_types/excel_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/excel_parser.py @@ -33,6 +33,7 @@ class ExcelParser(FileTypeParser): ENCODING = None + ALL_SHEETS = "*" def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]: """ @@ -62,18 +63,20 @@ async def infer_schema( # Validate the format of the config self.validate_format(config.format, logger) + excel_format = config.format + if not isinstance(excel_format, ExcelFormat): + raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) fields: Dict[str, str] = {} with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: - df = self.open_and_parse_file(fp, logger, file) - for column, df_type in df.dtypes.items(): - # Choose the broadest data type if the column's data type differs in dataframes - prev_frame_column_type = fields.get(column) # type: ignore [call-overload] - fields[column] = self.dtype_to_json_type( # type: ignore [index] - prev_frame_column_type, - df_type, - ) + for df in self._parse_excel_file(fp, excel_format, logger, file).values(): + for column, df_type in df.dtypes.items(): + prev_frame_column_type = fields.get(column) # type: ignore [call-overload] + fields[column] = self.dtype_to_json_type( # type: ignore [index] + prev_frame_column_type, + df_type, + ) schema = { field: ( @@ -109,18 +112,19 @@ def parse_records( # Validate the format of the config self.validate_format(config.format, logger) + excel_format = config.format + if not isinstance(excel_format, ExcelFormat): + raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) try: # Open and parse the file using the stream reader with 
stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: - df = self.open_and_parse_file(fp, logger, file) - # Yield records as dictionaries - # DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson - # DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior - # see PR description: https://github.com/airbytehq/airbyte/pull/44444/ - yield from orjson.loads( - df.to_json(orient="records", date_format="iso", date_unit="us") - ) + for df in self._parse_excel_file(fp, excel_format, logger, file).values(): + # DataFrame.to_dict() returns pandas.Timestamp values not serializable by orjson. + # DataFrame.to_json() serializes datetimes to iso8601 with microseconds. + yield from orjson.loads( + df.to_json(orient="records", date_format="iso", date_unit="us") + ) except Exception as exc: # Raise a RecordParseError if any exception occurs during parsing @@ -187,7 +191,8 @@ def _open_and_parse_file_with_calamine( fp: Union[IOBase, str, Path], logger: logging.Logger, file: RemoteFile, - ) -> pd.DataFrame: + sheet_name: Union[int, str, None] = 0, + ) -> Union[pd.DataFrame, Dict[Union[int, str], pd.DataFrame]]: """Opens and parses Excel file using Calamine engine. Args: @@ -202,7 +207,7 @@ def _open_and_parse_file_with_calamine( ExcelCalamineParsingError: If Calamine fails to parse the file. """ try: - return pd.ExcelFile(fp, engine="calamine").parse() # type: ignore [arg-type, call-overload, no-any-return] + return pd.ExcelFile(fp, engine="calamine").parse(sheet_name=sheet_name) # type: ignore [arg-type, call-overload, no-any-return] except BaseException as exc: # Calamine engine raises PanicException(child of BaseException) if Calamine fails to parse the file. # Checking if ValueError in exception arg to know if it was actually an error during parsing due to invalid values in cells. 
@@ -222,7 +227,8 @@ def _open_and_parse_file_with_openpyxl( fp: Union[IOBase, str, Path], logger: logging.Logger, file: RemoteFile, - ) -> pd.DataFrame: + sheet_name: Union[int, str, None] = 0, + ) -> Union[pd.DataFrame, Dict[Union[int, str], pd.DataFrame]]: """Opens and parses Excel file using Openpyxl engine. Args: @@ -245,19 +251,20 @@ def _open_and_parse_file_with_openpyxl( with warnings.catch_warnings(record=True) as warning_records: warnings.simplefilter("always") - df = pd.ExcelFile(fp, engine="openpyxl").parse() # type: ignore [arg-type, call-overload] + dfs = pd.ExcelFile(fp, engine="openpyxl").parse(sheet_name=sheet_name) # type: ignore [arg-type, call-overload] for warning in warning_records: logger.warning(f"Openpyxl warning for {file.file_uri_for_logging}: {warning.message}") - return df # type: ignore [no-any-return] + return dfs # type: ignore [no-any-return] def open_and_parse_file( self, fp: Union[IOBase, str, Path], logger: logging.Logger, file: RemoteFile, - ) -> pd.DataFrame: + sheet_name: Union[int, str, None] = 0, + ) -> Union[pd.DataFrame, Dict[Union[int, str], pd.DataFrame]]: """Opens and parses the Excel file with Calamine-first and Openpyxl fallback. Args: @@ -269,6 +276,29 @@ def open_and_parse_file( pd.DataFrame: Parsed data from the Excel file. 
""" try: - return self._open_and_parse_file_with_calamine(fp, logger, file) + return self._open_and_parse_file_with_calamine(fp, logger, file, sheet_name) except ExcelCalamineParsingError: - return self._open_and_parse_file_with_openpyxl(fp, logger, file) + return self._open_and_parse_file_with_openpyxl(fp, logger, file, sheet_name) + + def _parse_excel_file( + self, + fp: Union[IOBase, str, Path], + excel_format: ExcelFormat, + logger: logging.Logger, + file: RemoteFile, + ) -> Dict[Union[int, str], pd.DataFrame]: + """Parses an Excel file and returns a dict of sheet name → DataFrame.""" + sheet_name = self._resolve_sheet_name(excel_format) + parsed = self.open_and_parse_file(fp, logger, file, sheet_name) + if isinstance(parsed, pd.DataFrame): + return {excel_format.sheet_name: parsed} + return parsed + + def _resolve_sheet_name(self, excel_format: ExcelFormat) -> Union[int, str, None]: + """Converts the string config value to a pandas-compatible `sheet_name` argument.""" + value = excel_format.sheet_name + if value == self.ALL_SHEETS: + return None + if value.isdecimal(): + return int(value) + return value diff --git a/unit_tests/sources/file_based/file_types/test_excel_parser.py b/unit_tests/sources/file_based/file_types/test_excel_parser.py index 18850e9b0..2a13b5598 100644 --- a/unit_tests/sources/file_based/file_types/test_excel_parser.py +++ b/unit_tests/sources/file_based/file_types/test_excel_parser.py @@ -3,6 +3,7 @@ # +import asyncio import datetime import warnings from io import BytesIO @@ -152,7 +153,7 @@ def test_open_and_parse_file_falls_back_to_openpyxl(mock_logger): calamine_excel_file = MagicMock() - def calamine_parse_side_effect(): + def calamine_parse_side_effect(**kwargs): raise FakePanic( "failed to construct date: PyErr { type: , value: ValueError('year 20225 is out of range'), traceback: None }" ) @@ -161,7 +162,7 @@ def calamine_parse_side_effect(): openpyxl_excel_file = MagicMock() - def openpyxl_parse_side_effect(): + def 
openpyxl_parse_side_effect(**kwargs): warnings.warn("Cell A146 has invalid date", UserWarning) return fallback_df @@ -238,3 +239,103 @@ def seek(self, *args, **kwargs): assert "Could not rewind stream" in msg assert remote_file.file_uri_for_logging in msg mock_excel.assert_called_once_with(fp, engine="openpyxl") + openpyxl_excel_file.parse.assert_called_once_with(sheet_name=0) + + +def _make_multisheet_excel_bytes() -> bytes: + """Creates an in-memory Excel workbook with two sheets for testing.""" + buf = BytesIO() + with pd.ExcelWriter(buf, engine="xlsxwriter") as writer: + pd.DataFrame({"col_a": ["first"], "shared": [1]}).to_excel( + writer, index=False, sheet_name="First" + ) + pd.DataFrame({"col_b": [2.5], "shared": [2]}).to_excel( + writer, index=False, sheet_name="Second" + ) + return buf.getvalue() + + +def _stream_reader_for(excel_bytes: bytes) -> MagicMock: + reader = MagicMock(spec=AbstractFileBasedStreamReader) + reader.open_file.return_value = BytesIO(excel_bytes) + return reader + + +@pytest.mark.parametrize( + "sheet_name,expected_records", + [ + pytest.param( + "0", + [{"col_a": "first", "shared": 1}], + id="default_first_sheet", + ), + pytest.param( + "Second", + [{"col_b": 2.5, "shared": 2}], + id="sheet_by_name", + ), + pytest.param( + "*", + [{"col_a": "first", "shared": 1}, {"col_b": 2.5, "shared": 2}], + id="all_sheets", + ), + ], +) +def test_parse_records_selects_configured_sheet(sheet_name, expected_records, remote_file): + parser = ExcelParser() + config = FileBasedStreamConfig(name="test_stream", format=ExcelFormat(sheet_name=sheet_name)) + reader = _stream_reader_for(_make_multisheet_excel_bytes()) + + records = list(parser.parse_records(config, remote_file, reader, MagicMock())) + + assert records == expected_records + + +@pytest.mark.parametrize( + "sheet_name,expected_schema", + [ + pytest.param( + "0", + {"col_a": {"type": "string"}, "shared": {"type": "number"}}, + id="first_sheet_schema", + ), + pytest.param( + "*", + { + "col_a": 
{"type": "string"}, + "col_b": {"type": "number"}, + "shared": {"type": "number"}, + }, + id="all_sheets_merged_schema", + ), + ], +) +def test_infer_schema_with_sheet_selection(sheet_name, expected_schema, remote_file): + parser = ExcelParser() + config = FileBasedStreamConfig(name="test_stream", format=ExcelFormat(sheet_name=sheet_name)) + reader = _stream_reader_for(_make_multisheet_excel_bytes()) + + loop = asyncio.new_event_loop() + try: + schema = loop.run_until_complete( + parser.infer_schema(config, remote_file, reader, MagicMock()) + ) + finally: + loop.close() + + assert schema == expected_schema + + +@pytest.mark.parametrize( + "config_value,expected", + [ + pytest.param("0", 0, id="zero_index"), + pytest.param("1", 1, id="numeric_index"), + pytest.param("MySheet", "MySheet", id="named_sheet"), + pytest.param("*", None, id="all_sheets"), + ], +) +def test_resolve_sheet_name(config_value, expected): + parser = ExcelParser() + fmt = ExcelFormat(sheet_name=config_value) + assert parser._resolve_sheet_name(fmt) == expected From 57ca4e19a2632255fc3e66ced3b906ff2dfb2c03 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 12 Jun 2026 11:57:21 +0000 Subject: [PATCH 2/2] fix: update expected spec snapshot with sheet_name field Co-Authored-By: bot_apk --- unit_tests/sources/file_based/scenarios/csv_scenarios.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/unit_tests/sources/file_based/scenarios/csv_scenarios.py index f16d83e20..730d0d727 100644 --- a/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -472,7 +472,13 @@ "default": "excel", "const": "excel", "type": "string", - } + }, + "sheet_name": { + "title": "Sheet Name", + "description": 'The Excel worksheet to read. 
Use a sheet name, a zero-indexed position like "0", or "*" to read all sheets.', + "default": "0", + "type": "string", + }, }, "required": ["filetype"], },