Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
90d7141
feat(file-based): bump unstructured to 0.18.x and use native markdown…
devin-ai-integration[bot] Jul 1, 2026
d362cc0
fix: add logging to empty ImportError handlers in _import_unstructured
devin-ai-integration[bot] Jul 1, 2026
f448c02
fix: use CSV content in invalid file type test scenarios
devin-ai-integration[bot] Jul 1, 2026
158d1d4
fix: use PDF magic bytes for corrupted file scenario
devin-ai-integration[bot] Jul 1, 2026
5a5044d
refactor: use normal imports for docx/pptx, keep only pdf guarded
devin-ai-integration[bot] Jul 1, 2026
6e23fab
fix: pass metadata_file_path for content detection and use monkeypatc…
devin-ai-integration[bot] Jul 1, 2026
23357ca
fix: restore PDF test scenarios and fail-loudly imports
devin-ai-integration[bot] Jul 1, 2026
7cf669b
fix: restore pdf extra and pin onnxruntime for Python 3.10 compat
devin-ai-integration[bot] Jul 1, 2026
2e6ddc2
fix: add onnxruntime to deptry DEP002 ignore list
devin-ai-integration[bot] Jul 1, 2026
5c55e38
fix: restore custom markdown rendering for parsing parity
devin-ai-integration[bot] Jul 1, 2026
ae389e3
fix(unstructured): add type annotations to fix mypy errors
devin-ai-integration[bot] Jul 1, 2026
d266710
fix: regenerate lockfile to avoid transitive dep bumps
devin-ai-integration[bot] Jul 1, 2026
50b15b9
fix: update pdfminer.six to 20260107 for unstructured 0.18.x compat
devin-ai-integration[bot] Jul 1, 2026
2b58f6b
fix: add else branch for unsupported file types in _read_file_locally
devin-ai-integration[bot] Jul 1, 2026
ad31932
fix: update scenario test expectations for unstructured 0.18.x
devin-ai-integration[bot] Jul 2, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 38 additions & 57 deletions airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,13 @@
import traceback
from datetime import datetime
from io import BytesIO, IOBase
from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast

import backoff
import dpath
import nltk
import requests
from unstructured.file_utils.filetype import (
EXT_TO_FILETYPE,
FILETYPE_TO_MIMETYPE,
STR_TO_FILETYPE,
FileType,
detect_filetype,
)
from unstructured.file_utils.filetype import FileType, detect_filetype
from unstructured.staging.base import elements_from_dicts, elements_to_md

from airbyte_cdk.models import FailureType
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
Expand Down Expand Up @@ -80,19 +74,30 @@ def optional_decode(contents: Union[str, bytes]) -> str:


def _import_unstructured() -> None:
"""Dynamically imported as needed, due to slow import speed."""
"""Dynamically imported as needed, due to slow import speed.

DOCX and PPTX are declared extras (`unstructured[docx, pptx]`) and will
fail loudly if missing. PDF requires heavy optional deps (`torch`,
`unstructured-inference`) that connectors must opt into by installing
`unstructured[pdf]`, so its import is the only one guarded.
"""
global unstructured_partition_pdf
global unstructured_partition_docx
global unstructured_partition_pptx

from unstructured.partition.docx import partition_docx
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.pptx import partition_pptx

# separate global variables to properly propagate typing
unstructured_partition_pdf = partition_pdf
unstructured_partition_docx = partition_docx
unstructured_partition_pptx = partition_pptx

try:
from unstructured.partition.pdf import partition_pdf

unstructured_partition_pdf = partition_pdf
except ImportError:
Comment thread
github-code-quality[bot] marked this conversation as resolved.
Fixed
logging.debug("PDF partition module unavailable; install unstructured[pdf] to enable.")
Comment thread
aaronsteers marked this conversation as resolved.
Outdated


def user_error(e: Exception) -> bool:
"""
Expand Down Expand Up @@ -207,13 +212,6 @@ def _read_file(
logger: logging.Logger,
) -> str:
_import_unstructured()
if (
(not unstructured_partition_pdf)
or (not unstructured_partition_docx)
or (not unstructured_partition_pptx)
):
# check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
raise Exception("unstructured library is not available")

filetype: FileType | None = self._get_filetype(file_handle, remote_file)

Expand Down Expand Up @@ -335,7 +333,7 @@ def _read_file_remotely(

data = self._params_to_dict(format.parameters, strategy)

file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}
file_data = {"files": ("filename", file_handle, filetype.mime_type)}

response = requests.post(
f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data
Expand All @@ -350,19 +348,12 @@ def _read_file_remotely(

json_response = response.json()

return self._render_markdown(json_response)
return elements_to_md(elements_from_dicts(json_response))

def _read_file_locally(
self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile
) -> str:
Comment thread
aaronsteers marked this conversation as resolved.
_import_unstructured()
if (
(not unstructured_partition_pdf)
or (not unstructured_partition_docx)
or (not unstructured_partition_pptx)
):
# check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
raise Exception("unstructured library is not available")

file: Any = file_handle

Expand All @@ -373,19 +364,24 @@ def _read_file_locally(

try:
if filetype == FileType.PDF:
# for PDF, read the file into a BytesIO object because some code paths in pdf parsing are doing an instance check on the file object and don't work with file-like objects
if not unstructured_partition_pdf:
raise ImportError("PDF partition dependencies are not installed")
file_handle.seek(0)
with BytesIO(file_handle.read()) as file:
file_handle.seek(0)
elements = unstructured_partition_pdf(file=file, strategy=strategy)
elif filetype == FileType.DOCX:
if not unstructured_partition_docx:
raise ImportError("DOCX partition dependencies are not installed")
elements = unstructured_partition_docx(file=file)
elif filetype == FileType.PPTX:
if not unstructured_partition_pptx:
raise ImportError("PPTX partition dependencies are not installed")
elements = unstructured_partition_pptx(file=file)
except Exception as e:
raise self._create_parse_error(remote_file, str(e))

return self._render_markdown([element.to_dict() for element in elements])
return elements_to_md(elements)

def _create_parse_error(
self,
Expand All @@ -405,8 +401,10 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
2. Use the file name if available
3. Use the file content
"""
if remote_file.mime_type and remote_file.mime_type in STR_TO_FILETYPE:
return STR_TO_FILETYPE[remote_file.mime_type]
if remote_file.mime_type:
filetype_from_mime = FileType.from_mime_type(remote_file.mime_type)
if filetype_from_mime is not None:
return filetype_from_mime

# set name to none, otherwise unstructured will try to get the modified date from the local file system
if hasattr(file, "name"):
Expand All @@ -418,24 +416,26 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
file_type: FileType | None = None
try:
file_type = detect_filetype(
filename=remote_file.uri,
file_path=remote_file.uri,
)
except Exception:
# Path doesn't exist locally. Try something else...
pass

if file_type and file_type != FileType.UNK:
return file_type

type_based_on_content = detect_filetype(file=file)
type_based_on_content = detect_filetype(
file=cast(IO[bytes], file), metadata_file_path=remote_file.uri
)
file.seek(0) # detect_filetype is reading to read the file content, so we need to reset

if type_based_on_content and type_based_on_content != FileType.UNK:
return type_based_on_content

extension = "." + remote_file.uri.split(".")[-1].lower()
if extension in EXT_TO_FILETYPE:
return EXT_TO_FILETYPE[extension]
filetype_from_ext = FileType.from_extension(extension)
if filetype_from_ext is not None:
return filetype_from_ext
Comment thread
coderabbitai[bot] marked this conversation as resolved.

return None

Expand All @@ -449,25 +449,6 @@ def _get_file_type_error_message(
supported_file_types = ", ".join([str(type) for type in self._supported_file_types()])
return f"File type {file_type or 'None'!s} is not supported. Supported file types are {supported_file_types}"

def _render_markdown(self, elements: List[Any]) -> str:
return "\n\n".join((self._convert_to_markdown(el) for el in elements))

def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
if dpath.get(el, "type") == "Title":
category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1
if not isinstance(category_depth, int):
category_depth = (
int(category_depth) if isinstance(category_depth, (str, float)) else 1
)
heading_str = "#" * category_depth
return f"{heading_str} {dpath.get(el, 'text')}"
elif dpath.get(el, "type") == "ListItem":
return f"- {dpath.get(el, 'text')}"
elif dpath.get(el, "type") == "Formula":
return f"```\n{dpath.get(el, 'text')}\n```"
else:
return str(dpath.get(el, "text", default=""))

@property
def file_read_mode(self) -> FileReadMode:
return FileReadMode.READ_BINARY
Expand Down
Loading
Loading