Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 12 additions & 10 deletions airbyte_cdk/sources/file_based/config/unstructured_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@ class LocalProcessingConfigModel(BaseModel):

class Config(OneOfOptionConfig):
title = "Local"
description = (
"Process files locally, supporting `fast` and `ocr` modes. This is the default option."
)
description = "Process files locally using MarkItDown. This is the default option."
discriminator = "mode"


class APIParameterConfigModel(BaseModel):
"""Deprecated: API processing is no longer supported. Retained for config backward compatibility."""

name: str = Field(
title="Parameter name",
description="The name of the unstructured API parameter to use",
Expand All @@ -32,40 +32,42 @@ class APIParameterConfigModel(BaseModel):


class APIProcessingConfigModel(BaseModel):
"""Deprecated: API processing is no longer supported. Retained for config backward compatibility."""

mode: Literal["api"] = Field("api", const=True)

api_key: str = Field(
default="",
always_show=True,
title="API Key",
airbyte_secret=True,
description="The API key to use matching the environment",
description="Deprecated: API processing is no longer supported.",
)

api_url: str = Field(
default="https://api.unstructured.io",
title="API URL",
always_show=True,
description="The URL of the unstructured API to use",
description="Deprecated: API processing is no longer supported.",
examples=["https://api.unstructured.com"],
)

parameters: Optional[List[APIParameterConfigModel]] = Field(
default=[],
always_show=True,
title="Additional URL Parameters",
description="List of parameters send to the API",
description="Deprecated: API processing is no longer supported.",
)

class Config(OneOfOptionConfig):
title = "via API"
description = "Process files via an API, using the `hi_res` mode. This option is useful for increased performance and accuracy, but requires an API key and a hosted instance of unstructured."
description = "Deprecated: API processing is no longer supported. All processing is now done locally using MarkItDown."
discriminator = "mode"


class UnstructuredFormat(BaseModel):
class Config(OneOfOptionConfig):
title = "Unstructured Document Format"
title = "Document Format"

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is still "unstructured" (lower case, to be specific) — we are just not using the brand name "Unstructured".

Revert the name so that it still says "Unstructured". Propose alternatives, but don't make any changes to how we describe it functionally until we've discussed them.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

☑️ Resolved in 484572b. Reverted unstructured_format.py to the original descriptions — title is back to "Unstructured Document Format" and all field descriptions are unchanged from main.

Happy to discuss naming alternatives when you're ready. Some options to consider:

  • Keep "Unstructured Document Format" as-is (current)
  • "Unstructured Document Format (powered by MarkItDown)" — adds clarity on the engine
  • "Document Format (unstructured)" — lowercase as you noted

No further description changes until we've discussed.


Devin session

description = "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."
discriminator = "filetype"

Expand All @@ -87,7 +89,7 @@ class Config(OneOfOptionConfig):
default="auto",
title="Parsing Strategy",
enum=["auto", "fast", "ocr_only", "hi_res"],
description="The strategy used to parse documents. `fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf",
description="Deprecated: This field is ignored. All parsing is now handled by MarkItDown.",
)

processing: Union[
Expand All @@ -96,7 +98,7 @@ class Config(OneOfOptionConfig):
] = Field(
default=LocalProcessingConfigModel(mode="local"),
title="Processing",
description="Processing configuration",
description="Deprecated: All processing is now done locally using MarkItDown.",
discriminator="mode",
type="object",
)
Loading
Loading