From c9d7b9e3d75f73d86806def6dae25ecc3c0e06a0 Mon Sep 17 00:00:00 2001
From: James Estevez
Date: Thu, 14 May 2026 10:07:36 -0700
Subject: [PATCH 1/2] test(bulkdata): regression test for #3886 - select_utils import emits no WARNING

Reproduces GH #3886: cumulusci/tasks/bulkdata/select_utils.py emits a
logger.warning() at module-import time inside its try/except ImportError
block for the optional [select] deps (numpy/pandas/annoy/scikit-learn).
Since extract.py transitively imports select_utils via mapping_parser and
step, every extract_dataset invocation surfaces the warning even when no
select strategy is configured.

The new test blocks numpy/pandas/annoy/sklearn via sys.modules sentinels,
re-imports select_utils, and asserts no WARNING-level record is emitted by
the module's own logger. Fails on dev source today; will pass once the
warning is deferred to the point of need.
---
 .../tasks/bulkdata/tests/test_select_utils.py | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/cumulusci/tasks/bulkdata/tests/test_select_utils.py b/cumulusci/tasks/bulkdata/tests/test_select_utils.py
index efa9502902..e1df25d2f4 100644
--- a/cumulusci/tasks/bulkdata/tests/test_select_utils.py
+++ b/cumulusci/tasks/bulkdata/tests/test_select_utils.py
@@ -1,3 +1,6 @@
+import logging
+import sys
+
 import pytest
 
 from cumulusci.tasks.bulkdata.select_utils import (
@@ -1042,3 +1045,73 @@ def test_split_and_filter_fields_multiple_unique_lookups():
     assert (
         select_fields == fields
     )  # No filtering applied since all components are unique
+
+
+def test_select_utils_import_does_not_emit_warning_when_optional_deps_missing(
+    caplog,
+):
+    """Regression test for GH #3886.
+
+    Importing ``cumulusci.tasks.bulkdata.select_utils`` must not emit a
+    WARNING-level log record at module-load time, even when the optional
+    ``[select]`` deps (numpy/pandas/annoy/scikit-learn) are unavailable.
+    Two transitive imports pull ``select_utils`` into ``extract.py``, so a
+    module-import warning fires on every ``extract_dataset`` invocation
+    regardless of whether the user opted into a similarity strategy.
+    """
+    blocked_top = {"numpy", "pandas", "annoy", "sklearn"}
+
+    saved_modules = {}
+    for name in list(sys.modules):
+        top = name.split(".", 1)[0]
+        if top in blocked_top:
+            saved_modules[name] = sys.modules[name]
+
+    saved_select_utils = sys.modules.pop("cumulusci.tasks.bulkdata.select_utils", None)
+
+    from pydantic.v1 import class_validators as _v1_class_validators
+
+    saved_func_refs = {
+        ref
+        for ref in _v1_class_validators._FUNCS
+        if ref.startswith("cumulusci.tasks.bulkdata.select_utils.")
+    }
+
+    try:
+        for name in list(sys.modules):
+            top = name.split(".", 1)[0]
+            if top in blocked_top:
+                sys.modules[name] = None
+        for name in blocked_top:
+            sys.modules[name] = None
+
+        _v1_class_validators._FUNCS -= saved_func_refs
+
+        caplog.clear()
+        with caplog.at_level(
+            logging.DEBUG, logger="cumulusci.tasks.bulkdata.select_utils"
+        ):
+            import cumulusci.tasks.bulkdata.select_utils  # noqa: F401
+
+        warning_records = [
+            r
+            for r in caplog.records
+            if r.levelno >= logging.WARNING
+            and r.name == "cumulusci.tasks.bulkdata.select_utils"
+        ]
+        assert warning_records == [], (
+            "Module import emitted WARNING-level log record(s) when optional "
+            "[select] deps are missing (#3886): "
+            f"{[r.getMessage() for r in warning_records]}"
+        )
+    finally:
+        sys.modules.pop("cumulusci.tasks.bulkdata.select_utils", None)
+        for name in blocked_top:
+            sys.modules.pop(name, None)
+        for name, mod in saved_modules.items():
+            sys.modules[name] = mod
+        _v1_class_validators._FUNCS |= saved_func_refs
+        if saved_select_utils is not None:
+            sys.modules["cumulusci.tasks.bulkdata.select_utils"] = saved_select_utils
+        else:
+            import cumulusci.tasks.bulkdata.select_utils  # noqa: F401

From 50f2cff707326a33c5fd1895f82ba09464dfcf6d Mon Sep 17 00:00:00 2001
From: James Estevez
Date: Thu, 14
May 2026 10:09:39 -0700
Subject: [PATCH 2/2] fix(bulkdata): #3886 - defer select optional-dep warning to point of need

Previously, cumulusci/tasks/bulkdata/select_utils.py emitted a
logger.warning() at module import time inside its try/except ImportError
block for the optional [select] deps (numpy/pandas/annoy/scikit-learn).
Because extract.py transitively imports select_utils via mapping_parser
and step, the warning surfaced on every extract_dataset invocation even
when the user had not configured any select strategy.

Move the emission out of module import. The warning now fires from
similarity_post_process() only when the high-volume branch
(complexity_constant >= 1000) would have used the Annoy fast path but
optional deps are unavailable - the exact code path that pays the perf
penalty the warning describes. A module-level flag ensures the message is
emitted at most once per process.

Refs #3886.
---
 cumulusci/tasks/bulkdata/select_utils.py | 28 ++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/cumulusci/tasks/bulkdata/select_utils.py b/cumulusci/tasks/bulkdata/select_utils.py
index 91f1546a87..6a71e4a56d 100644
--- a/cumulusci/tasks/bulkdata/select_utils.py
+++ b/cumulusci/tasks/bulkdata/select_utils.py
@@ -21,13 +21,31 @@
 
     OPTIONAL_DEPENDENCIES_AVAILABLE = True
 except ImportError:
+    OPTIONAL_DEPENDENCIES_AVAILABLE = False
+
+_MISSING_OPTIONAL_DEPS_WARNING_EMITTED = False
+
+
+def _warn_missing_optional_deps_once() -> None:
+    """Emit the optional-[select]-deps warning at most once per process.
+
+    The warning previously fired at module import time, so every
+    ``extract_dataset`` invocation surfaced it via transitive imports even
+    when no similarity strategy was in use (#3886). Defer the emission to
+    the code path that actually pays the perf penalty.
+    """
+    global _MISSING_OPTIONAL_DEPS_WARNING_EMITTED
+    if _MISSING_OPTIONAL_DEPS_WARNING_EMITTED:
+        return
+    _MISSING_OPTIONAL_DEPS_WARNING_EMITTED = True
     logger.warning(
-        f"Optional dependencies are missing. "
-        "Handling high volumes of records for the 'select' functionality will be significantly slower, "
+        "Optional dependencies are missing. "
+        "Handling high volumes of records for the 'select' functionality "
+        "will be significantly slower, "
         "as optimizations for this feature are currently disabled. "
-        f"To enable optimized performance, install all required dependencies using: {get_cci_upgrade_command()}[select]\n"
+        "To enable optimized performance, install all required dependencies "
+        f"using: {get_cci_upgrade_command()}[select]"
     )
-    OPTIONAL_DEPENDENCIES_AVAILABLE = False
 
 
 class SelectStrategy(StrEnum):
@@ -313,6 +331,8 @@ def similarity_post_process(
     insert_records = []
 
     if complexity_constant < 1000 or not OPTIONAL_DEPENDENCIES_AVAILABLE:
+        if complexity_constant >= 1000 and not OPTIONAL_DEPENDENCIES_AVAILABLE:
+            _warn_missing_optional_deps_once()
         select_records, insert_records = levenshtein_post_process(
             load_records, query_records, fields, weights, threshold
         )