diff --git a/.gitignore b/.gitignore index ddd8cd7d2..fc9537593 100644 --- a/.gitignore +++ b/.gitignore @@ -67,6 +67,25 @@ doc/CHANGES.rst doc/RELEASE_PROCESS.rst doc/CONTRIBUTING.rst doc/sg_execution_times.rst +# RST content files synced from skrub/_docs at build time (conf.py) +doc/about.rst +doc/column_level_featurizing.rst +doc/data_ops.rst +doc/default_wrangling.rst +doc/development.rst +doc/documentation.rst +doc/exploring_a_dataframe.rst +doc/howto.rst +doc/index.rst +doc/install.rst +doc/joining_dataframes.rst +doc/learning_materials.rst +doc/multi_column_operations.rst +doc/tutorial_example.rst +doc/vision.rst +doc/guides/ +doc/modules/ +doc/tutorials/ .DS_Store doc/_templates/demo_table_report_generated.html doc/reference/*.rst diff --git a/CHANGES.rst b/CHANGES.rst index 7c87367fa..dadbfe8ba 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -65,6 +65,11 @@ Changes :pr:`2048` by :user:`Riccardo Cappuzzo `. - The minimum required version of matplotlib has been increased from 3.4.3 to 3.6.1. :pr:`2159` by :user:`Riccardo Cappuzzo `. +- The package build has been updated to include the user guide and examples with + the package, so that it is now possible to access them directly from the wheel + rather than having to rely on the online docs. Docs and examples are now stored + in ``skrub/_docs``, rather than in the root of the repository. + :pr:`2173` by :user:`Riccardo Cappuzzo `. Bugfixes -------- diff --git a/README.rst b/README.rst index f8322a91e..a959ecaa7 100644 --- a/README.rst +++ b/README.rst @@ -17,8 +17,7 @@ skrub .. |black| image:: https://img.shields.io/badge/code%20style-black-000000.svg -**skrub** (formerly *dirty_cat*) is a Python -library that facilitates machine learning with dataframes. +**skrub** is a Python library that facilitates machine learning with dataframes. If you like the package, spread the word and ⭐ this repository! You can also join the `Discord server `_. 
@@ -28,6 +27,14 @@ Website: https://skrub-data.org/ See our `examples `_, or check out the `learning materials `_. +Documentation and examples are bundled with the package itself, in +``skrub/_docs``. After installing, you can find them at: + +.. code-block:: python + + import skrub + print(skrub.__docs_dir__) + Installation ------------ diff --git a/doc/Makefile b/doc/Makefile index 695de599b..5a6982785 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -29,24 +29,40 @@ html: rm -rf $(BUILDDIR)/html/_images #rm -rf _build/doctrees/ SKB_TABLE_REPORT_VERBOSITY=0 $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + # Build markdown sources so llms.txt links point to .md files + SKB_TABLE_REPORT_VERBOSITY=0 $(SPHINXBUILD) -b markdown $(ALLSPHINXOPTS) $(BUILDDIR)/markdown + cp -r $(BUILDDIR)/markdown/. $(BUILDDIR)/html/_sources/ @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." html-noplot: - SKB_TABLE_REPORT_VERBOSITY=0 $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + SKB_TABLE_REPORT_VERBOSITY=0 SKIP_JUPYTERLITE=1 $(SPHINXBUILD) -D markdown_uri_doc_suffix="html.md" -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + # Build markdown sources so llms.txt links point to .md files + SKB_TABLE_REPORT_VERBOSITY=0 SKIP_JUPYTERLITE=1 $(SPHINXBUILD) -D plot_gallery=0 -b markdown $(ALLSPHINXOPTS) $(BUILDDIR)/markdown + cp -r $(BUILDDIR)/markdown/. $(BUILDDIR)/html/_sources/ @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + SKB_TABLE_REPORT_VERBOSITY=0 $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Linkcheck finished. Results are in $(BUILDDIR)/linkcheck." 
linkcheck-noplot: - $(SPHINXBUILD) -D plot_gallery=0 -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck-noplot + SKB_TABLE_REPORT_VERBOSITY=0 SKIP_JUPYTERLITE=1 $(SPHINXBUILD) -D plot_gallery=0 -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck-noplot @echo @echo "Linkcheck (no plot) finished. Results are in $(BUILDDIR)/linkcheck-noplot." +markdown: + SKB_TABLE_REPORT_VERBOSITY=0 $(SPHINXBUILD) -b markdown $(ALLSPHINXOPTS) $(BUILDDIR)/markdown + @echo + @echo "Markdown build finished. The markdown files are in $(BUILDDIR)/markdown." + +markdown-noplot: + SKB_TABLE_REPORT_VERBOSITY=0 SKIP_JUPYTERLITE=1 $(SPHINXBUILD) -D plot_gallery=0 -b markdown $(ALLSPHINXOPTS) $(BUILDDIR)/markdown + @echo + @echo "Markdown build (no plot) finished. The markdown files are in $(BUILDDIR)/markdown." + # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile diff --git a/doc/conf.py b/doc/conf.py index aa18871ad..9094b0db8 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -24,6 +24,13 @@ import jinja2 +# Allow skipping jupyterlite to speed up builds (e.g. html-noplot) +_SKIP_JUPYTERLITE = os.environ.get("SKIP_JUPYTERLITE", "").strip() in ( + "1", + "true", + "yes", +) + # Generate the table report html file for the homepage sys.path.append(os.path.relpath(".")) from data_ops_report import create_data_ops_report @@ -43,14 +50,33 @@ from github_link import make_linkcode_resolve from sphinx_gallery.notebook import add_code_cell, add_markdown_cell -# -- Copy files for docs -------------------------------------------------- +# -- Sync documentation source files from skrub/_docs -------------------- # -# We avoid duplicating the information, but we do not use symlinks to be -# able to build the docs on Windows +# skrub/_docs is the single source of truth for all guide/content RST files +# so they are packaged with the wheel. 
We copy them into doc/ at build time +# rather than using symlinks (to support Windows builds). +# +# CHANGES.rst, CONTRIBUTING.rst and RELEASE_PROCESS.rst are canonical in the +# project root and are NOT stored in skrub/_docs. shutil.copyfile("../RELEASE_PROCESS.rst", "RELEASE_PROCESS.rst") shutil.copyfile("../CHANGES.rst", "CHANGES.rst") shutil.copyfile("../CONTRIBUTING.rst", "CONTRIBUTING.rst") +_docs_src = Path("../skrub/_docs") + +# Copy top-level RST content files +_skip_toplevel = {"CHANGES.rst", "CONTRIBUTING.rst", "RELEASE_PROCESS.rst"} +for _rst_file in _docs_src.glob("*.rst"): + if _rst_file.name not in _skip_toplevel: + shutil.copyfile(_rst_file, _rst_file.name) + +# Copy content subdirectories (guides, modules) +for _subdir in ["guides", "modules"]: + shutil.copytree(_docs_src / _subdir, _subdir, dirs_exist_ok=True) + +# Copy tutorials source files for sphinx-gallery +shutil.copytree(_docs_src / "tutorials", "tutorials", dirs_exist_ok=True) + # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. @@ -76,8 +102,14 @@ "sphinx_copybutton", "sphinx_gallery.gen_gallery", "autoshortsummary", + "sphinx_llms_txt", + "sphinx_markdown_builder", ] +# -- sphinx-llms-txt configuration ------------------------------------------- +# Link to Markdown sources in _sources/ (generated by the markdown builder). +llms_txt_uri_template = "{base_url}_sources/{docname}.md" + try: import sphinxext.opengraph # noqa @@ -85,18 +117,21 @@ except ImportError: print("ERROR: sphinxext.opengraph import failed") -try: - import jupyterlite_sphinx # noqa: F401 - - extensions.append("jupyterlite_sphinx") - with_jupyterlite = True -except ImportError: - # In some cases we don't want to require jupyterlite_sphinx to be installed, - # e.g. 
the doc-min-dependencies build - warnings.warn( - "jupyterlite_sphinx is not installed, you need to install it " - "if you want JupyterLite links to appear in each example" - ) +if not _SKIP_JUPYTERLITE: + try: + import jupyterlite_sphinx # noqa: F401 + + extensions.append("jupyterlite_sphinx") + with_jupyterlite = True + except ImportError: + # In some cases we don't want to require jupyterlite_sphinx to be installed, + # e.g. the doc-min-dependencies build + warnings.warn( + "jupyterlite_sphinx is not installed, you need to install it " + "if you want JupyterLite links to appear in each example" + ) + with_jupyterlite = False +else: with_jupyterlite = False import sphinx_autosummary_accessors @@ -480,7 +515,7 @@ def call_garbage_collector(gallery_conf, fname): # See https://sphinx-gallery.github.io/stable/configuration.html#link-to-documentation # noqa }, "filename_pattern": ".*", - "examples_dirs": ["../examples", "tutorials"], + "examples_dirs": ["../skrub/_docs/examples", "tutorials"], "gallery_dirs": ["auto_examples", "auto_tutorials"], "within_subsection_order": FileNameSortKey, "download_all_examples": False, diff --git a/pixi.lock b/pixi.lock index 04daec327..0c6b7e4e8 100644 --- a/pixi.lock +++ b/pixi.lock @@ -565,7 +565,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.2.0-py314h3de4e8d_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.2-py314h67df5f8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.3-py314h67df5f8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.8.1-hecca717_1.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda @@ -606,7 +606,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda @@ -645,7 +645,7 @@ environments: - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/numpy/2.6.0.dev0/numpy-2.6.0.dev0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pandas/2.3.3+13.gb640e985cb/pandas-2.3.3+13.gb640e985cb.tar.gz - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pillow/12.3.0.dev0/pillow-12.3.0.dev0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev169/pyarrow-25.0.0.dev169-cp314-cp314-manylinux_2_28_x86_64.whl + - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev171/pyarrow-25.0.0.dev171-cp314-cp314-manylinux_2_28_x86_64.whl - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/scikit-learn/1.10.dev0/scikit_learn-1.10.dev0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - pypi: 
https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/scipy/1.19.0.dev0/scipy-1.19.0.dev0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl osx-64: @@ -671,7 +671,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda @@ -691,7 +691,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/brotli-python-1.2.0-py314h3262eb8_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-h500dc9f_9.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.2-py314h77fa6c7_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.3-py314h77fa6c7_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/icu-78.3-h25d91c4_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libcxx-22.1.8-h19cb2f5_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libexpat-2.8.1-hcc62823_1.conda @@ -728,7 +728,7 @@ environments: - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/numpy/2.6.0.dev0/numpy-2.6.0.dev0-cp314-cp314-macosx_10_15_x86_64.whl - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pandas/2.3.3+13.gb640e985cb/pandas-2.3.3+13.gb640e985cb.tar.gz - pypi: 
https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pillow/12.3.0.dev0/pillow-12.3.0.dev0-cp314-cp314-macosx_10_15_x86_64.whl - - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev169/pyarrow-25.0.0.dev169-cp314-cp314-macosx_12_0_x86_64.whl + - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev171/pyarrow-25.0.0.dev171-cp314-cp314-macosx_12_0_x86_64.whl - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/scikit-learn/1.10.dev0/scikit_learn-1.10.dev0-cp314-cp314-macosx_10_15_x86_64.whl - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/scipy/1.19.0.dev0/scipy-1.19.0.dev0-cp314-cp314-macosx_10_15_x86_64.whl osx-arm64: @@ -754,7 +754,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda @@ -774,7 +774,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/brotli-python-1.2.0-py314h3daef5d_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_9.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.2-py314h6e9b3f0_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.3-py314h6e9b3f0_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/icu-78.3-hef89b57_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libcxx-22.1.8-h55c6f16_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libexpat-2.8.1-hf6b4638_1.conda @@ -811,7 +811,7 @@ environments: - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/numpy/2.6.0.dev0/numpy-2.6.0.dev0-cp314-cp314-macosx_11_0_arm64.whl - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pandas/2.3.3+13.gb640e985cb/pandas-2.3.3+13.gb640e985cb.tar.gz - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pillow/12.3.0.dev0/pillow-12.3.0.dev0-cp314-cp314-macosx_11_0_arm64.whl - - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev169/pyarrow-25.0.0.dev169-cp314-cp314-macosx_12_0_arm64.whl + - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev171/pyarrow-25.0.0.dev171-cp314-cp314-macosx_12_0_arm64.whl - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/scikit-learn/1.10.dev0/scikit_learn-1.10.dev0-cp314-cp314-macosx_12_0_arm64.whl - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/scipy/1.19.0.dev0/scipy-1.19.0.dev0-cp314-cp314-macosx_12_0_arm64.whl win-64: @@ -837,7 +837,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyh09c184e_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda @@ -858,7 +858,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/win_inet_pton-1.1.0-pyh7428d3b_8.conda - conda: https://conda.anaconda.org/conda-forge/win-64/brotli-python-1.2.0-py314he701e3d_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h0ad9c76_9.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.2-py314h2359020_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.3-py314h2359020_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libexpat-2.8.1-hac47afa_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libffi-3.5.2-h3d046cb_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/liblzma-5.8.3-hfd05255_0.conda @@ -895,7 +895,7 @@ environments: - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/numpy/2.6.0.dev0/numpy-2.6.0.dev0-cp314-cp314-win_amd64.whl - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pandas/2.3.3+13.gb640e985cb/pandas-2.3.3+13.gb640e985cb.tar.gz - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pillow/12.3.0.dev0/pillow-12.3.0.dev0-cp314-cp314-win_amd64.whl - - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev169/pyarrow-25.0.0.dev169-cp314-cp314-win_amd64.whl + - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev171/pyarrow-25.0.0.dev171-cp314-cp314-win_amd64.whl - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/scikit-learn/1.10.dev0/scikit_learn-1.10.dev0-cp314-cp314-win_amd64.whl - pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/scipy/1.19.0.dev0/scipy-1.19.0.dev0-cp314-cp314-win_amd64.whl ci-py310-min-deps: @@ -917,7 +917,7 @@ environments: - 
conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-he90730b_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.2-py310h3788b33_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.2-py310h3406613_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.3-py310h3406613_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.28-hac629b4_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h24cb091_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/epoxy-1.5.10-hb03c661_2.conda @@ -1095,7 +1095,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -1152,7 +1152,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -1183,7 +1183,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-h500dc9f_9.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/cairo-1.18.4-h7656bdc_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/contourpy-1.3.2-py310hf166250_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.2-py310h399bfa0_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.3-py310h399bfa0_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/epoxy-1.5.10-h8616949_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/fontconfig-2.18.1-h7a4440b_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/fonttools-4.63.0-py310h399bfa0_0.conda @@ -1289,7 +1289,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -1320,7 +1320,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_9.conda - conda: 
https://conda.anaconda.org/conda-forge/osx-arm64/cairo-1.18.4-he0f2337_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/contourpy-1.3.2-py310h7f4e7e6_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.2-py310hb46c203_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.3-py310hb46c203_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/epoxy-1.5.10-hc919400_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/fontconfig-2.18.1-h2b252f5_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/fonttools-4.63.0-py310hb46c203_0.conda @@ -1426,7 +1426,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyh09c184e_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -1457,7 +1457,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h0ad9c76_9.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h477c42c_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/contourpy-1.3.2-py310hc19bc0b_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.2-py310hdb0e946_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.3-py310hdb0e946_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/fontconfig-2.18.1-hd47e2ca_0.conda - 
conda: https://conda.anaconda.org/conda-forge/win-64/fonttools-4.63.0-py310hdb0e946_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/freetype-2.14.3-h57928b3_1.conda @@ -1589,7 +1589,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.2-py310h3788b33_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.2-py310h3406613_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.3-py310h3406613_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.28-hd9c7081_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h24cb091_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/epoxy-1.5.10-hb03c661_2.conda @@ -1812,7 +1812,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -1877,7 +1877,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -1924,7 +1924,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/c-ares-1.34.6-hb5e19a0_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/cairo-1.18.4-h950ec3b_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/contourpy-1.3.2-py310hf166250_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.2-py310h399bfa0_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.3-py310h399bfa0_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/epoxy-1.5.10-h8616949_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/fontconfig-2.18.1-h7a4440b_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/fonttools-4.63.0-py310h399bfa0_0.conda @@ -2073,7 +2073,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -2120,7 +2120,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/c-ares-1.34.6-hc919400_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/cairo-1.18.4-h6a3b0d2_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/contourpy-1.3.2-py310h7f4e7e6_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.2-py310hb46c203_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.3-py310hb46c203_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/epoxy-1.5.10-hc919400_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/fontconfig-2.18.1-h2b252f5_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/fonttools-4.63.0-py310hb46c203_0.conda @@ -2269,7 +2269,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyh09c184e_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -2316,7 +2316,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/c-ares-1.34.6-hfd05255_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h477c42c_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/contourpy-1.3.2-py310hc19bc0b_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.2-py310hdb0e946_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.3-py310hdb0e946_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/fontconfig-2.18.1-hd47e2ca_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/fonttools-4.63.0-py310hdb0e946_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/freetype-2.14.3-h57928b3_1.conda @@ -2485,7 +2485,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-he90730b_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.3-py311h724c32c_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.2-py311h3778330_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.3-py311h3778330_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.28-hac629b4_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h24cb091_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/double-conversion-3.4.0-hecca717_0.conda @@ -2733,7 +2733,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ 
-2825,7 +2825,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -2886,7 +2886,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/c-ares-1.34.6-hc919400_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/cairo-1.18.4-he0f2337_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/contourpy-1.3.3-py311h7d85929_4.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.2-py311hc290fe0_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.3-py311hc290fe0_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/epoxy-1.5.10-hc919400_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/fmt-12.1.0-h403dcb5_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/fontconfig-2.18.1-h2b252f5_0.conda @@ -3078,7 +3078,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyh09c184e_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -3135,7 +3135,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/c-ares-1.34.6-hfd05255_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h477c42c_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/contourpy-1.3.3-py311h275cad7_4.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.2-py311h3f79411_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.3-py311h3f79411_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/double-conversion-3.4.0-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/fmt-12.1.0-h7f4e812_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/fontconfig-2.18.1-hd47e2ca_0.conda @@ -3291,7 +3291,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-he90730b_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.3-py314h97ea11e_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.2-py314h67df5f8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.3-py314h67df5f8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.28-hac629b4_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h24cb091_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/double-conversion-3.4.0-hecca717_0.conda @@ -3457,7 +3457,7 @@ environments: - conda: 
https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -3517,7 +3517,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -3547,7 +3547,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-h500dc9f_9.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/cairo-1.18.4-h7656bdc_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/contourpy-1.3.3-py314h22a2ed9_4.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.2-py314h77fa6c7_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.3-py314h77fa6c7_0.conda - conda: 
https://conda.anaconda.org/conda-forge/osx-64/epoxy-1.5.10-h8616949_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/fontconfig-2.18.1-h7a4440b_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/freetype-2.14.3-h694c41f_1.conda @@ -3660,7 +3660,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -3690,7 +3690,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_9.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/cairo-1.18.4-he0f2337_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/contourpy-1.3.3-py314hf8a3a22_4.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.2-py314h6e9b3f0_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.3-py314h6e9b3f0_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/epoxy-1.5.10-hc919400_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/fontconfig-2.18.1-h2b252f5_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/freetype-2.14.3-hce30654_1.conda @@ -3802,7 +3802,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyh09c184e_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -3833,7 +3833,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h0ad9c76_9.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h477c42c_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/contourpy-1.3.3-py314hf309875_4.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.2-py314h2359020_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.3-py314h2359020_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/double-conversion-3.4.0-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/fontconfig-2.18.1-hd47e2ca_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/freetype-2.14.3-h57928b3_1.conda @@ -3961,7 +3961,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-he90730b_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.3-py314h97ea11e_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.2-py314h67df5f8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.3-py314h67df5f8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.28-hac629b4_1.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h24cb091_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/double-conversion-3.4.0-hecca717_0.conda @@ -3988,11 +3988,11 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/lerc-4.1.0-hdb68285_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libabseil-20260107.1-cxx17_h7b12aa8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-24.0.0-hb646d72_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-24.0.0-h635bf11_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-compute-24.0.0-h53684a4_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-24.0.0-h635bf11_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-24.0.0-hb4dd7c2_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-24.0.0-hb642ee7_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-24.0.0-h635bf11_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-compute-24.0.0-h53684a4_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-24.0.0-h635bf11_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-24.0.0-hb4dd7c2_8_cpu.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-8_h4a7cf45_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.2.0-hb03c661_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.2.0-hb03c661_1.conda @@ -4025,8 +4025,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libglx-1.7.0-ha4b6fd6_3.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libglx-devel-1.7.0-ha4b6fd6_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_19.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-3.5.0-h8d2ee43_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-3.5.0-hdbdcf42_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-3.6.0-h8d2ee43_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-3.6.0-hdbdcf42_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgrpc-1.78.1-h1d1128b_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h3b78370_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.4.1-hb03c661_0.conda @@ -4040,7 +4040,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libopengl-1.7.0-ha4b6fd6_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-1.27.0-h9692893_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-headers-1.27.0-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libparquet-24.0.0-h7376487_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libparquet-24.0.0-h7376487_8_cpu.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libpciaccess-0.19-hb03c661_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.58-h421ea60_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libpq-18.4-hd5a49e9_0.conda @@ -4175,7 +4175,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: 
https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -4248,7 +4248,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -4301,7 +4301,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/c-ares-1.34.6-hb5e19a0_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/cairo-1.18.4-h7656bdc_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/contourpy-1.3.3-py314h22a2ed9_4.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.2-py314h77fa6c7_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.3-py314h77fa6c7_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/epoxy-1.5.10-h8616949_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/fontconfig-2.18.1-h7a4440b_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/freetype-2.14.3-h694c41f_1.conda @@ -4323,11 +4323,11 @@ 
environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/lcms2-2.19.1-h5ea7634_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/lerc-4.1.0-h35c7297_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libabseil-20260107.1-cxx17_h7ed6875_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-24.0.0-hf9fdb71_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-acero-24.0.0-h91633f5_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-compute-24.0.0-hb38465b_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-dataset-24.0.0-h91633f5_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-substrait-24.0.0-h613493e_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-24.0.0-haea8852_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-acero-24.0.0-h91633f5_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-compute-24.0.0-hb38465b_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-dataset-24.0.0-h91633f5_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-substrait-24.0.0-h613493e_8_cpu.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libblas-3.11.0-8_he492b99_openblas.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libbrotlicommon-1.2.0-h8616949_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libbrotlidec-1.2.0-h8616949_1.conda @@ -4349,8 +4349,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/libgfortran-15.2.0-h7e5c614_19.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libgfortran5-15.2.0-hd16e46c_19.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libglib-2.88.1-hf28f236_2.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libgoogle-cloud-3.5.0-h8b848e0_1.conda - - conda: 
https://conda.anaconda.org/conda-forge/osx-64/libgoogle-cloud-storage-3.5.0-hea209c6_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libgoogle-cloud-3.6.0-h8b848e0_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libgoogle-cloud-storage-3.6.0-hea209c6_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libgrpc-1.78.1-h147dede_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libiconv-1.18-h57a12c2_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libintl-0.25.1-h3184127_1.conda @@ -4362,7 +4362,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/libopenblas-0.3.33-openmp_h9e49c7b_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libopentelemetry-cpp-1.27.0-h7a0a166_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libopentelemetry-cpp-headers-1.27.0-h694c41f_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libparquet-24.0.0-h0f82bca_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libparquet-24.0.0-h0f82bca_8_cpu.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libpng-1.6.58-he930e7c_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libprotobuf-6.33.5-hff14b61_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libraqm-0.10.5-hcf81f31_0.conda @@ -4464,7 +4464,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -4517,7 +4517,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/c-ares-1.34.6-hc919400_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/cairo-1.18.4-he0f2337_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/contourpy-1.3.3-py314hf8a3a22_4.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.2-py314h6e9b3f0_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.3-py314h6e9b3f0_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/epoxy-1.5.10-hc919400_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/fontconfig-2.18.1-h2b252f5_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/freetype-2.14.3-hce30654_1.conda @@ -4539,11 +4539,11 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/lcms2-2.19.1-hdfa7624_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/lerc-4.1.0-h1eee2c3_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libabseil-20260107.1-cxx17_h2062a1b_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-24.0.0-h1caba66_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-acero-24.0.0-ha4f4840_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-compute-24.0.0-h8d10c55_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-dataset-24.0.0-ha4f4840_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-substrait-24.0.0-h05be00f_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-24.0.0-h6045e8e_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-acero-24.0.0-ha4f4840_8_cpu.conda + - 
conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-compute-24.0.0-h8d10c55_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-dataset-24.0.0-ha4f4840_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-substrait-24.0.0-h05be00f_8_cpu.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libblas-3.11.0-8_h51639a9_openblas.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libbrotlicommon-1.2.0-hc919400_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libbrotlidec-1.2.0-hc919400_1.conda @@ -4565,8 +4565,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgfortran-15.2.0-h07b0088_19.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgfortran5-15.2.0-hdae7583_19.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libglib-2.88.1-ha08bb59_2.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgoogle-cloud-3.5.0-h688a705_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgoogle-cloud-storage-3.5.0-ha114238_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgoogle-cloud-3.6.0-h688a705_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgoogle-cloud-storage-3.6.0-ha114238_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgrpc-1.78.1-h3e3f78d_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libiconv-1.18-h23cfdf5_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libintl-0.25.1-h493aca8_0.conda @@ -4578,7 +4578,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libopenblas-0.3.33-openmp_he657e61_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libopentelemetry-cpp-1.27.0-h08d5cc3_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libopentelemetry-cpp-headers-1.27.0-hce30654_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/osx-arm64/libparquet-24.0.0-h840b369_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libparquet-24.0.0-h840b369_8_cpu.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libpng-1.6.58-h132b30e_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libprotobuf-6.33.5-h2d4b707_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libraqm-0.10.5-h29bd36a_0.conda @@ -4679,7 +4679,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyh09c184e_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -4733,7 +4733,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/c-ares-1.34.6-hfd05255_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h477c42c_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/contourpy-1.3.3-py314hf309875_4.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.2-py314h2359020_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.3-py314h2359020_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/double-conversion-3.4.0-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/fontconfig-2.18.1-hd47e2ca_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/freetype-2.14.3-h57928b3_1.conda @@ 
-4750,11 +4750,11 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/lcms2-2.19.1-hf2c6c5f_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/lerc-4.1.0-hd936e49_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libabseil-20260107.1-cxx17_h0eb2380_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-24.0.0-h54e786e_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-acero-24.0.0-h7d8d6a5_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-compute-24.0.0-h081cd8e_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-dataset-24.0.0-h7d8d6a5_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-substrait-24.0.0-h524e9bd_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-24.0.0-h9dce539_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-acero-24.0.0-h7d8d6a5_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-compute-24.0.0-h081cd8e_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-dataset-24.0.0-h7d8d6a5_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-substrait-24.0.0-h524e9bd_8_cpu.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libblas-3.11.0-8_h8455456_mkl.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libbrotlicommon-1.2.0-hfd05255_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libbrotlidec-1.2.0-hfd05255_1.conda @@ -4773,8 +4773,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/libgd-2.3.3-h4974f7c_12.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libglib-2.88.1-h7ce1215_2.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libgomp-15.2.0-h8ee18e1_19.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libgoogle-cloud-3.5.0-he22669a_1.conda - - conda: 
https://conda.anaconda.org/conda-forge/win-64/libgoogle-cloud-storage-3.5.0-he04ea4c_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libgoogle-cloud-3.6.0-he22669a_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libgoogle-cloud-storage-3.6.0-he04ea4c_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libgrpc-1.78.1-h9ff2b3e_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libhwloc-2.13.0-default_h049141e_1000.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libiconv-1.18-hc1393d2_2.conda @@ -4785,7 +4785,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/libmpdec-4.0.0-hfd05255_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libopentelemetry-cpp-1.27.0-hc88f397_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libopentelemetry-cpp-headers-1.27.0-h57928b3_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libparquet-24.0.0-h7051d1f_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libparquet-24.0.0-h7051d1f_8_cpu.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libpng-1.6.58-h7351971_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libprotobuf-6.33.5-h6cf2d3c_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libraqm-0.10.5-h781ae3c_0.conda @@ -4876,7 +4876,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-he90730b_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.3-py314h97ea11e_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.2-py314h67df5f8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.3-py314h67df5f8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.28-hac629b4_1.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h24cb091_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/double-conversion-3.4.0-hecca717_0.conda @@ -5056,7 +5056,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -5129,7 +5129,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -5163,7 +5163,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-h500dc9f_9.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/cairo-1.18.4-h7656bdc_1.conda - conda: 
https://conda.anaconda.org/conda-forge/osx-64/contourpy-1.3.3-py314h22a2ed9_4.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.2-py314h77fa6c7_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.3-py314h77fa6c7_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/epoxy-1.5.10-h8616949_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/fontconfig-2.18.1-h7a4440b_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/freetype-2.14.3-h694c41f_1.conda @@ -5290,7 +5290,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -5324,7 +5324,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_9.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/cairo-1.18.4-he0f2337_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/contourpy-1.3.3-py314hf8a3a22_4.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.2-py314h6e9b3f0_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.3-py314h6e9b3f0_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/epoxy-1.5.10-hc919400_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/fontconfig-2.18.1-h2b252f5_0.conda - conda: 
https://conda.anaconda.org/conda-forge/osx-arm64/freetype-2.14.3-hce30654_1.conda @@ -5450,7 +5450,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyh09c184e_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -5485,7 +5485,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h0ad9c76_9.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h477c42c_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/contourpy-1.3.3-py314hf309875_4.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.2-py314h2359020_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.3-py314h2359020_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/double-conversion-3.4.0-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/fontconfig-2.18.1-hd47e2ca_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/freetype-2.14.3-h57928b3_1.conda @@ -6169,7 +6169,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-he90730b_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py312h460c074_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.3-py312h0a2e395_4.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.2-py312h8a5da7c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.3-py312h8a5da7c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.28-hac629b4_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h24cb091_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/debugpy-1.8.21-py312h8285ef7_0.conda @@ -6466,7 +6466,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/mdit-py-plugins-0.6.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/memory_profiler-0.61.0-pyhcf101f3_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.2.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.3.1-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.4.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/narwhals-2.22.1-pyhcf101f3_0.conda @@ -6504,7 +6504,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -6539,6 
+6539,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-gallery-0.21.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-last-updated-by-git-0.3.8-pyhe01879c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-markdown-builder-0.6.10-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-sitemap-2.9.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-2.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-2.0.0-pyhd8ed1ab_1.conda @@ -6667,7 +6669,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/mdit-py-plugins-0.6.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/memory_profiler-0.61.0-pyhcf101f3_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.2.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.3.1-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.4.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/narwhals-2.22.1-pyhcf101f3_0.conda @@ -6706,7 +6708,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -6741,6 +6743,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-gallery-0.21.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-last-updated-by-git-0.3.8-pyhe01879c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-markdown-builder-0.6.10-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-sitemap-2.9.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-2.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-2.0.0-pyhd8ed1ab_1.conda @@ -6803,7 +6807,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/cairo-1.18.4-he0f2337_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/cffi-2.0.0-py313h224173a_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/contourpy-1.3.3-py313h2af2deb_4.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.2-py313h65a2061_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.3-py313h65a2061_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/debugpy-1.8.21-py313h1188861_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/epoxy-1.5.10-hc919400_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/fmt-12.1.0-h403dcb5_0.conda @@ -7046,7 +7050,7 
@@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/mdit-py-plugins-0.6.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/memory_profiler-0.61.0-pyhcf101f3_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.2.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.3.1-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.4.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/narwhals-2.22.1-pyhcf101f3_0.conda @@ -7082,7 +7086,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyh09c184e_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -7117,6 +7121,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-gallery-0.21.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-last-updated-by-git-0.3.8-pyhe01879c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/sphinx-markdown-builder-0.6.10-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-sitemap-2.9.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-2.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-2.0.0-pyhd8ed1ab_1.conda @@ -7174,7 +7180,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h477c42c_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cffi-2.0.0-py313h5ea7bf4_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/contourpy-1.3.3-py313h1a38498_4.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.2-py313hd650c13_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.3-py313hd650c13_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/debugpy-1.8.21-py313h927ade5_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/double-conversion-3.4.0-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/fmt-12.1.0-h7f4e812_0.conda @@ -7643,7 +7649,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/mdit-py-plugins-0.6.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/memory_profiler-0.61.0-pyhcf101f3_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.2.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.3.1-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.4.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/narwhals-2.22.1-pyhcf101f3_0.conda @@ -7703,6 +7709,8 @@ environments: - conda: 
https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-gallery-0.21.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-last-updated-by-git-0.3.8-pyhe01879c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-markdown-builder-0.6.10-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-sitemap-2.9.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-2.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-2.0.0-pyhd8ed1ab_1.conda @@ -7809,7 +7817,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/mdit-py-plugins-0.6.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/memory_profiler-0.61.0-pyhcf101f3_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.2.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.3.1-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.4.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/narwhals-2.22.1-pyhcf101f3_0.conda @@ -7870,6 +7878,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-gallery-0.21.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-last-updated-by-git-0.3.8-pyhe01879c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/sphinx-markdown-builder-0.6.10-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-sitemap-2.9.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-2.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-2.0.0-pyhd8ed1ab_1.conda @@ -8150,7 +8160,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/mdit-py-plugins-0.6.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/memory_profiler-0.61.0-pyhcf101f3_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.2.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.3.1-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.4.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/narwhals-2.22.1-pyhcf101f3_0.conda @@ -8209,6 +8219,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-gallery-0.21.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-last-updated-by-git-0.3.8-pyhe01879c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-markdown-builder-0.6.10-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-sitemap-2.9.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-2.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-2.0.0-pyhd8ed1ab_1.conda @@ 
-9068,7 +9080,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-he90730b_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.3-py314h97ea11e_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.2-py314h67df5f8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.3-py314h67df5f8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.28-hac629b4_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h24cb091_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/double-conversion-3.4.0-hecca717_0.conda @@ -9095,11 +9107,11 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/lerc-4.1.0-hdb68285_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libabseil-20260107.1-cxx17_h7b12aa8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-24.0.0-hb646d72_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-24.0.0-h635bf11_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-compute-24.0.0-h53684a4_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-24.0.0-h635bf11_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-24.0.0-hb4dd7c2_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-24.0.0-hb642ee7_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-24.0.0-h635bf11_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-compute-24.0.0-h53684a4_8_cpu.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-24.0.0-h635bf11_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-24.0.0-hb4dd7c2_8_cpu.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-8_h4a7cf45_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.2.0-hb03c661_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.2.0-hb03c661_1.conda @@ -9132,8 +9144,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libglx-1.7.0-ha4b6fd6_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libglx-devel-1.7.0-ha4b6fd6_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_19.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-3.5.0-h8d2ee43_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-3.5.0-hdbdcf42_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-3.6.0-h8d2ee43_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-3.6.0-hdbdcf42_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgrpc-1.78.1-h1d1128b_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h3b78370_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.4.1-hb03c661_0.conda @@ -9147,7 +9159,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libopengl-1.7.0-ha4b6fd6_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-1.27.0-h9692893_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-headers-1.27.0-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libparquet-24.0.0-h7376487_7_cpu.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libparquet-24.0.0-h7376487_8_cpu.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libpciaccess-0.19-hb03c661_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.58-h421ea60_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libpq-18.4-hd5a49e9_0.conda @@ -9282,7 +9294,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -9355,7 +9367,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -9408,7 +9420,7 @@ environments: - conda: 
https://conda.anaconda.org/conda-forge/osx-64/c-ares-1.34.6-hb5e19a0_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/cairo-1.18.4-h7656bdc_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/contourpy-1.3.3-py314h22a2ed9_4.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.2-py314h77fa6c7_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.3-py314h77fa6c7_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/epoxy-1.5.10-h8616949_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/fontconfig-2.18.1-h7a4440b_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/freetype-2.14.3-h694c41f_1.conda @@ -9430,11 +9442,11 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/lcms2-2.19.1-h5ea7634_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/lerc-4.1.0-h35c7297_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libabseil-20260107.1-cxx17_h7ed6875_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-24.0.0-hf9fdb71_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-acero-24.0.0-h91633f5_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-compute-24.0.0-hb38465b_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-dataset-24.0.0-h91633f5_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-substrait-24.0.0-h613493e_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-24.0.0-haea8852_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-acero-24.0.0-h91633f5_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-compute-24.0.0-hb38465b_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-dataset-24.0.0-h91633f5_8_cpu.conda + - conda: 
https://conda.anaconda.org/conda-forge/osx-64/libarrow-substrait-24.0.0-h613493e_8_cpu.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libblas-3.11.0-8_he492b99_openblas.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libbrotlicommon-1.2.0-h8616949_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libbrotlidec-1.2.0-h8616949_1.conda @@ -9456,8 +9468,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/libgfortran-15.2.0-h7e5c614_19.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libgfortran5-15.2.0-hd16e46c_19.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libglib-2.88.1-hf28f236_2.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libgoogle-cloud-3.5.0-h8b848e0_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libgoogle-cloud-storage-3.5.0-hea209c6_1.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libgoogle-cloud-3.6.0-h8b848e0_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libgoogle-cloud-storage-3.6.0-hea209c6_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libgrpc-1.78.1-h147dede_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libiconv-1.18-h57a12c2_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libintl-0.25.1-h3184127_1.conda @@ -9469,7 +9481,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/libopenblas-0.3.33-openmp_h9e49c7b_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libopentelemetry-cpp-1.27.0-h7a0a166_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libopentelemetry-cpp-headers-1.27.0-h694c41f_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-64/libparquet-24.0.0-h0f82bca_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/libparquet-24.0.0-h0f82bca_8_cpu.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libpng-1.6.58-he930e7c_0.conda - conda: 
https://conda.anaconda.org/conda-forge/osx-64/libprotobuf-6.33.5-hff14b61_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/libraqm-0.10.5-hcf81f31_0.conda @@ -9571,7 +9583,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -9624,7 +9636,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/c-ares-1.34.6-hc919400_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/cairo-1.18.4-he0f2337_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/contourpy-1.3.3-py314hf8a3a22_4.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.2-py314h6e9b3f0_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.3-py314h6e9b3f0_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/epoxy-1.5.10-hc919400_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/fontconfig-2.18.1-h2b252f5_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/freetype-2.14.3-hce30654_1.conda @@ -9646,11 +9658,11 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/lcms2-2.19.1-hdfa7624_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/lerc-4.1.0-h1eee2c3_0.conda - conda: 
https://conda.anaconda.org/conda-forge/osx-arm64/libabseil-20260107.1-cxx17_h2062a1b_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-24.0.0-h1caba66_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-acero-24.0.0-ha4f4840_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-compute-24.0.0-h8d10c55_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-dataset-24.0.0-ha4f4840_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-substrait-24.0.0-h05be00f_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-24.0.0-h6045e8e_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-acero-24.0.0-ha4f4840_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-compute-24.0.0-h8d10c55_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-dataset-24.0.0-ha4f4840_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-substrait-24.0.0-h05be00f_8_cpu.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libblas-3.11.0-8_h51639a9_openblas.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libbrotlicommon-1.2.0-hc919400_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libbrotlidec-1.2.0-hc919400_1.conda @@ -9672,8 +9684,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgfortran-15.2.0-h07b0088_19.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgfortran5-15.2.0-hdae7583_19.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libglib-2.88.1-ha08bb59_2.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgoogle-cloud-3.5.0-h688a705_1.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgoogle-cloud-storage-3.5.0-ha114238_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/osx-arm64/libgoogle-cloud-3.6.0-h688a705_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgoogle-cloud-storage-3.6.0-ha114238_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgrpc-1.78.1-h3e3f78d_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libiconv-1.18-h23cfdf5_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libintl-0.25.1-h493aca8_0.conda @@ -9685,7 +9697,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libopenblas-0.3.33-openmp_he657e61_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libopentelemetry-cpp-1.27.0-h08d5cc3_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libopentelemetry-cpp-headers-1.27.0-hce30654_0.conda - - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libparquet-24.0.0-h840b369_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libparquet-24.0.0-h840b369_8_cpu.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libpng-1.6.58-h132b30e_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libprotobuf-6.33.5-h2d4b707_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libraqm-0.10.5-h29bd36a_0.conda @@ -9786,7 +9798,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.20.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.3.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyh09c184e_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda @@ -9840,7 +9852,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/c-ares-1.34.6-hfd05255_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h477c42c_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/contourpy-1.3.3-py314hf309875_4.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.2-py314h2359020_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.3-py314h2359020_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/double-conversion-3.4.0-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/fontconfig-2.18.1-hd47e2ca_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/freetype-2.14.3-h57928b3_1.conda @@ -9857,11 +9869,11 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/lcms2-2.19.1-hf2c6c5f_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/lerc-4.1.0-hd936e49_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libabseil-20260107.1-cxx17_h0eb2380_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-24.0.0-h54e786e_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-acero-24.0.0-h7d8d6a5_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-compute-24.0.0-h081cd8e_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-dataset-24.0.0-h7d8d6a5_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-substrait-24.0.0-h524e9bd_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-24.0.0-h9dce539_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-acero-24.0.0-h7d8d6a5_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-compute-24.0.0-h081cd8e_8_cpu.conda + - conda: 
https://conda.anaconda.org/conda-forge/win-64/libarrow-dataset-24.0.0-h7d8d6a5_8_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-substrait-24.0.0-h524e9bd_8_cpu.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libblas-3.11.0-8_h8455456_mkl.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libbrotlicommon-1.2.0-hfd05255_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libbrotlidec-1.2.0-hfd05255_1.conda @@ -9880,8 +9892,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/libgd-2.3.3-h4974f7c_12.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libglib-2.88.1-h7ce1215_2.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libgomp-15.2.0-h8ee18e1_19.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libgoogle-cloud-3.5.0-he22669a_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libgoogle-cloud-storage-3.5.0-he04ea4c_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libgoogle-cloud-3.6.0-he22669a_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libgoogle-cloud-storage-3.6.0-he04ea4c_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libgrpc-1.78.1-h9ff2b3e_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libhwloc-2.13.0-default_h049141e_1000.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libiconv-1.18-hc1393d2_2.conda @@ -9892,7 +9904,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/libmpdec-4.0.0-hfd05255_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libopentelemetry-cpp-1.27.0-hc88f397_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libopentelemetry-cpp-headers-1.27.0-h57928b3_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libparquet-24.0.0-h7051d1f_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libparquet-24.0.0-h7051d1f_8_cpu.conda - conda: 
https://conda.anaconda.org/conda-forge/win-64/libpng-1.6.58-h7351971_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libprotobuf-6.33.5-h6cf2d3c_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libraqm-0.10.5-h781ae3c_0.conda @@ -10806,6 +10818,7 @@ packages: - libzlib >=1.3.2,<2.0a0 - aws-c-common >=0.14.0,<0.14.1.0a0 license: Apache-2.0 + license_family: APACHE purls: [] run_exports: weak: @@ -11363,9 +11376,9 @@ packages: run_exports: {} size: 324013 timestamp: 1769155968691 -- conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.2-py310h3406613_0.conda - sha256: eed3e0f62c8c3be2e3660f72ca735bbea9bea595c013909f2d5e56639fc316c7 - md5: 41486f4c383c638f8a2e5b9e9922748e +- conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.3-py310h3406613_0.conda + sha256: d5ff485f9134e91657bf894fe6535fbdf54e41b11238c6b37701f5a605bfb66a + md5: a53275194d9c40d82ac81e89dccae517 depends: - __glibc >=2.17,<3.0.a0 - libgcc >=14 @@ -11374,13 +11387,13 @@ packages: - tomli license: Apache-2.0 purls: - - pkg:pypi/coverage?source=compressed-mapping + - pkg:pypi/coverage?source=hash-mapping run_exports: {} - size: 318098 - timestamp: 1781985009030 -- conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.2-py311h3778330_0.conda - sha256: 3254419a2f43a5eeb7bbadde029c52c3ac3ce91e890880af5af1a0cc32f393ee - md5: 4f531f4944ed9aaf1961d6b6735028e9 + size: 318490 + timestamp: 1782178112039 +- conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.3-py311h3778330_0.conda + sha256: a143654fedbc23b70b6acc2077e2b6eaf5ff05b9f311084ffe5652d39e9d2020 + md5: fd575752ccdef69e8381a26d641d04a4 depends: - __glibc >=2.17,<3.0.a0 - libgcc >=14 @@ -11391,11 +11404,11 @@ packages: purls: - pkg:pypi/coverage?source=hash-mapping run_exports: {} - size: 404985 - timestamp: 1781984891678 -- conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.2-py312h8a5da7c_0.conda - sha256: 
407b63be0b3288e775a029101836bdf86c2433e853149c58392d84b9d36b72c4 - md5: cb33d381f9299e24c8bb859223742aef + size: 405041 + timestamp: 1782178072991 +- conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.3-py312h8a5da7c_0.conda + sha256: 15b33937f062c7c94f5978127fbcae1d3b9c30ff4dff59adab9ab2bd18365024 + md5: 685d6d2fac5fd5abfc581db3c94652b9 depends: - __glibc >=2.17,<3.0.a0 - libgcc >=14 @@ -11404,13 +11417,13 @@ packages: - tomli license: Apache-2.0 purls: - - pkg:pypi/coverage?source=compressed-mapping + - pkg:pypi/coverage?source=hash-mapping run_exports: {} - size: 394056 - timestamp: 1781984929803 -- conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.2-py314h67df5f8_0.conda - sha256: 3f2eddbfeff95b4ddb00ab8569c6c0687a8558dfc8c729f9b8126c2265eeb8e2 - md5: 12894cdaed7259b00ccce63806598ca8 + size: 393058 + timestamp: 1782178186395 +- conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.14.3-py314h67df5f8_0.conda + sha256: 68f6814d548e6b1d8b655371cb34b909f5862d5d80b4b6ccf7231a84ceeb88da + md5: e3aecdc8eab8a93c5aad5f113fa91509 depends: - __glibc >=2.17,<3.0.a0 - libgcc >=14 @@ -11419,10 +11432,10 @@ packages: - tomli license: Apache-2.0 purls: - - pkg:pypi/coverage?source=compressed-mapping + - pkg:pypi/coverage?source=hash-mapping run_exports: {} - size: 419850 - timestamp: 1781985049797 + size: 419228 + timestamp: 1782178151393 - conda: https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.28-hac629b4_1.conda sha256: 7684da83306bb69686c0506fb09aa7074e1a55ade50c3a879e4e5df6eebb1009 md5: af491aae930edc096b58466c51c4126c @@ -11602,7 +11615,7 @@ packages: license: MIT license_family: MIT purls: - - pkg:pypi/fonttools?source=hash-mapping + - pkg:pypi/fonttools?source=compressed-mapping run_exports: {} size: 3045399 timestamp: 1778770357867 @@ -11620,7 +11633,7 @@ packages: license: MIT license_family: MIT purls: - - pkg:pypi/fonttools?source=hash-mapping + - pkg:pypi/fonttools?source=compressed-mapping 
run_exports: {} size: 3007892 timestamp: 1778770568019 @@ -12545,10 +12558,10 @@ packages: - libarrow >=20.0.0,<20.1.0a0 size: 9438373 timestamp: 1774279501142 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-24.0.0-hb646d72_7_cpu.conda - build_number: 7 - sha256: 5bb6b744f6f488ea75f9161175dc1740a8e5bc4bf4201bf4b84e5f4138414c78 - md5: 955fc6cc7d4dad4bdcc792141a43b5cb +- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-24.0.0-hb642ee7_8_cpu.conda + build_number: 8 + sha256: b30e965aa4b57413da99690e01473dc81a6a24ce1f7548f102350c1d26f4f08a + md5: 5e12d802f30c0a1d9c3db30133fe1ec3 depends: - __glibc >=2.17,<3.0.a0 - aws-crt-cpp >=0.40.1,<0.40.2.0a0 @@ -12564,8 +12577,8 @@ packages: - libbrotlidec >=1.2.0,<1.3.0a0 - libbrotlienc >=1.2.0,<1.3.0a0 - libgcc >=14 - - libgoogle-cloud >=3.5.0,<3.6.0a0 - - libgoogle-cloud-storage >=3.5.0,<3.6.0a0 + - libgoogle-cloud >=3.6.0,<3.7.0a0 + - libgoogle-cloud-storage >=3.6.0,<3.7.0a0 - libopentelemetry-cpp >=1.27.0,<1.28.0a0 - libprotobuf >=6.33.5,<6.33.6.0a0 - libstdcxx >=14 @@ -12575,16 +12588,16 @@ packages: - snappy >=1.2.2,<1.3.0a0 - zstd >=1.5.7,<1.6.0a0 constrains: - - parquet-cpp <0.0a0 - - arrow-cpp <0.0a0 - apache-arrow-proc =*=cpu + - arrow-cpp <0.0a0 + - parquet-cpp <0.0a0 license: Apache-2.0 purls: [] run_exports: weak: - libarrow >=24.0.0,<24.1.0a0 - size: 6525708 - timestamp: 1781907939132 + size: 6522840 + timestamp: 1782184573454 - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-15.0.2-h7599340_55_cpu.conda build_number: 55 sha256: 9842fe6ba600f21332a9c2d0f671a3b06ba07792d4d5d10139f7ccfdddb04cf8 @@ -12620,14 +12633,14 @@ packages: - libarrow-acero >=20.0.0,<20.1.0a0 size: 669282 timestamp: 1774279586712 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-24.0.0-h635bf11_7_cpu.conda - build_number: 7 - sha256: 3e84d52908eb55a17dd3e907b195246ccfaef3171107e67b107be11c5c137f27 - md5: ac99e1831a4e498755a32d72820e44db +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-24.0.0-h635bf11_8_cpu.conda + build_number: 8 + sha256: 7d5ff43ac1492f1f7be0b8f497d2ed9782b391a7573aa4f582bf5bb012b33a80 + md5: 7ee20a0ce202d7f8c1c80aeb15427874 depends: - __glibc >=2.17,<3.0.a0 - - libarrow 24.0.0 hb646d72_7_cpu - - libarrow-compute 24.0.0 h53684a4_7_cpu + - libarrow 24.0.0 hb642ee7_8_cpu + - libarrow-compute 24.0.0 h53684a4_8_cpu - libgcc >=14 - libstdcxx >=14 license: Apache-2.0 @@ -12635,15 +12648,15 @@ packages: run_exports: weak: - libarrow-acero >=24.0.0,<24.1.0a0 - size: 590906 - timestamp: 1781908115933 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-compute-24.0.0-h53684a4_7_cpu.conda - build_number: 7 - sha256: 50bf05387ebef61649521a6e1a8fb9a666f0b3cb317ef99a239a8ed0f29c73fb - md5: d96583ed99278f50296ebbe220e23f0b + size: 591077 + timestamp: 1782184817230 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-compute-24.0.0-h53684a4_8_cpu.conda + build_number: 8 + sha256: 0e5a9c2080effae7fd660453eedabedc4945eb9752a81062459f77308e3793a6 + md5: dd1398cbd330470f75f202ac99225ed8 depends: - __glibc >=2.17,<3.0.a0 - - libarrow 24.0.0 hb646d72_7_cpu + - libarrow 24.0.0 hb642ee7_8_cpu - libgcc >=14 - libre2-11 >=2025.11.5 - libstdcxx >=14 @@ -12654,8 +12667,8 @@ packages: run_exports: weak: - libarrow-compute >=24.0.0,<24.1.0a0 - size: 2988711 - timestamp: 1781908000610 + size: 2992150 + timestamp: 1782184697848 - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-15.0.2-h7599340_55_cpu.conda build_number: 55 sha256: fb6185f6b6f854d696ed890cf03f611a6941aa4c78fde585f542c5e8e813aab1 @@ -12695,25 +12708,25 @@ packages: - libarrow-dataset >=20.0.0,<20.1.0a0 size: 638107 timestamp: 1774279729327 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-24.0.0-h635bf11_7_cpu.conda - build_number: 7 - sha256: 9fd49a3532788e06ccbae5993005bafe2998ffcc3a33a0b42764590070b0ac12 - md5: 930d75defa0600cc52704fbedb0e4280 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-24.0.0-h635bf11_8_cpu.conda + build_number: 8 + sha256: 32fc98ff80fd72b4dd8d8b0a2f49c5e1d778f26e29d91314fa5af3f687e63e4c + md5: 7b0fe5832f7f4f9bbccfbe599482e5aa depends: - __glibc >=2.17,<3.0.a0 - - libarrow 24.0.0 hb646d72_7_cpu - - libarrow-acero 24.0.0 h635bf11_7_cpu - - libarrow-compute 24.0.0 h53684a4_7_cpu + - libarrow 24.0.0 hb642ee7_8_cpu + - libarrow-acero 24.0.0 h635bf11_8_cpu + - libarrow-compute 24.0.0 h53684a4_8_cpu - libgcc >=14 - - libparquet 24.0.0 h7376487_7_cpu + - libparquet 24.0.0 h7376487_8_cpu - libstdcxx >=14 license: Apache-2.0 purls: [] run_exports: weak: - libarrow-dataset >=24.0.0,<24.1.0a0 - size: 590790 - timestamp: 1781908195795 + size: 590422 + timestamp: 1782184900125 - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-flight-15.0.2-h1f524f1_55_cpu.conda build_number: 55 sha256: 07566dc71f150a34872bd92078bddf06990ea9aac564f73b648369eef0b36b83 @@ -12823,17 +12836,17 @@ packages: - libarrow-substrait >=20.0.0,<20.1.0a0 size: 529670 timestamp: 1774279833247 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-24.0.0-hb4dd7c2_7_cpu.conda - build_number: 7 - sha256: a1f2909056c3535a5408cd1b60c1f6b90f92817a205ea3ff8e2aec42da9856f6 - md5: bb59cc5c481ef1c15212c303e15489b3 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-24.0.0-hb4dd7c2_8_cpu.conda + build_number: 8 + sha256: cc0e3f6ef64d2bc60eefd84e06e122de600249433f6a50f4f092bca31f8dcdc5 + md5: f03a27d9512c5754cc1d189b9ca78204 depends: - __glibc >=2.17,<3.0.a0 - libabseil * cxx17* - libabseil >=20260107.1,<20260108.0a0 - - libarrow 24.0.0 hb646d72_7_cpu - - libarrow-acero 24.0.0 h635bf11_7_cpu - - libarrow-dataset 24.0.0 h635bf11_7_cpu + - libarrow 24.0.0 hb642ee7_8_cpu + - libarrow-acero 24.0.0 h635bf11_8_cpu + - libarrow-dataset 24.0.0 h635bf11_8_cpu - libgcc >=14 - libprotobuf >=6.33.5,<6.33.6.0a0 - libstdcxx >=14 @@ -12842,8 +12855,8 @@ packages: 
run_exports: weak: - libarrow-substrait >=24.0.0,<24.1.0a0 - size: 500849 - timestamp: 1781908222418 + size: 500657 + timestamp: 1782184927343 - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-8_h4a7cf45_openblas.conda build_number: 8 sha256: b2da6bfd72a1c9cb143ccf64bf5b28790cb4eb58bd1cb978f6537b2322f7d48b @@ -13646,9 +13659,9 @@ packages: - libgoogle-cloud >=3.3.0,<3.4.0a0 size: 2558266 timestamp: 1774212240265 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-3.5.0-h8d2ee43_1.conda - sha256: 42c8ca362013d0378ba58afb61940d23c94e0f7127004190dcd12fe4a3072953 - md5: 8ae0593085ca8148fdbf0bc8f62e79c1 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-3.6.0-h8d2ee43_0.conda + sha256: eb6fe89a6e2ffa6b485c437022e15d2173c2da3ada86690cf250bcfe6f6382d5 + md5: 50a88a9c7d89d854336c633966b67e56 depends: - __glibc >=2.17,<3.0.a0 - libabseil * cxx17* @@ -13659,17 +13672,17 @@ packages: - libopentelemetry-cpp >=1.27.0,<1.28.0a0 - libprotobuf >=6.33.5,<6.33.6.0a0 - libstdcxx >=14 - - openssl >=3.5.6,<4.0a0 + - openssl >=3.5.7,<4.0a0 constrains: - - libgoogle-cloud 3.5.0 *_1 + - libgoogle-cloud 3.6.0 *_0 license: Apache-2.0 license_family: Apache purls: [] run_exports: weak: - - libgoogle-cloud >=3.5.0,<3.6.0a0 - size: 2647694 - timestamp: 1780029060448 + - libgoogle-cloud >=3.6.0,<3.7.0a0 + size: 2680630 + timestamp: 1781922536584 - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-2.34.0-h0121fbd_0.conda sha256: aa1b3b30ae6b2eab7c9e6a8e2fd8ec3776f25d2e3f0b6f9dc547ff8083bf25fa md5: 9f0c43225243c81c6991733edcaafff5 @@ -13712,16 +13725,16 @@ packages: - libgoogle-cloud-storage >=3.3.0,<3.4.0a0 size: 779217 timestamp: 1774212426084 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-3.5.0-hdbdcf42_1.conda - sha256: 6914f9b0f2d5bb0c5687b880c6c352a2333449d03ce80e6826230675062b57f1 - md5: 6f79d5f72cfcdd3509112233a8aedc2e +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-3.6.0-hdbdcf42_0.conda + sha256: 2d94ab8302408d34894024a80604e85936bb208a487841a222cafdb15c143f23 + md5: a5001567e3c2758834d63129ecb89bb1 depends: - __glibc >=2.17,<3.0.a0 - libabseil - libcrc32c >=1.1.2,<1.2.0a0 - libcurl - libgcc >=14 - - libgoogle-cloud 3.5.0 h8d2ee43_1 + - libgoogle-cloud 3.6.0 h8d2ee43_0 - libstdcxx >=14 - libzlib >=1.3.2,<2.0a0 - openssl @@ -13730,9 +13743,9 @@ packages: purls: [] run_exports: weak: - - libgoogle-cloud-storage >=3.5.0,<3.6.0a0 - size: 779116 - timestamp: 1780029183339 + - libgoogle-cloud-storage >=3.6.0,<3.7.0a0 + size: 785866 + timestamp: 1781922659639 - conda: https://conda.anaconda.org/conda-forge/linux-64/libgrpc-1.67.1-h25350d4_2.conda sha256: 675ab892e51614d511317f704564c8c0a8b85e7620948f733eff99800ad25570 md5: bfcedaf5f9b003029cc6abe9431f66bf @@ -14223,13 +14236,13 @@ packages: - libparquet >=20.0.0,<20.1.0a0 size: 1266871 timestamp: 1774279693519 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libparquet-24.0.0-h7376487_7_cpu.conda - build_number: 7 - sha256: 5d69cc37ef693176cc42e14bd9cab41001f7da1967d66b478fd4bfb8a9b84b3d - md5: a62bee3afc7722d5c2598b24b1d9cb62 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libparquet-24.0.0-h7376487_8_cpu.conda + build_number: 8 + sha256: 9d466a57037ee713cf954c04a5c8756f0042c54d0d698f3f918f2df8bd77b5b1 + md5: 2b1feb7e1c3900157172fac5a69b6252 depends: - __glibc >=2.17,<3.0.a0 - - libarrow 24.0.0 hb646d72_7_cpu + - libarrow 24.0.0 hb642ee7_8_cpu - libgcc >=14 - libstdcxx >=14 - libthrift >=0.22.0,<0.22.1.0a0 @@ -14239,8 +14252,8 @@ packages: run_exports: weak: - libparquet >=24.0.0,<24.1.0a0 - size: 1426290 - timestamp: 1781908088327 + size: 1426831 + timestamp: 1782184788047 - conda: https://conda.anaconda.org/conda-forge/linux-64/libpciaccess-0.19-hb03c661_0.conda sha256: f41721636a7c2e51bc2c642e1127955ab9c81145470714fdaac44d4d09e4af41 md5: 33082e13b4769b48cfeb648e15bfe3fc @@ -15440,6 
+15453,7 @@ packages: constrains: - numpy-base <0a0 license: BSD-3-Clause + license_family: BSD purls: - pkg:pypi/numpy?source=compressed-mapping run_exports: @@ -15462,6 +15476,7 @@ packages: constrains: - numpy-base <0a0 license: BSD-3-Clause + license_family: BSD purls: - pkg:pypi/numpy?source=compressed-mapping run_exports: @@ -16077,7 +16092,7 @@ packages: license: Apache-2.0 license_family: APACHE purls: - - pkg:pypi/propcache?source=hash-mapping + - pkg:pypi/propcache?source=compressed-mapping run_exports: {} size: 51401 timestamp: 1780037772959 @@ -16092,7 +16107,7 @@ packages: license: Apache-2.0 license_family: APACHE purls: - - pkg:pypi/propcache?source=hash-mapping + - pkg:pypi/propcache?source=compressed-mapping run_exports: {} size: 51586 timestamp: 1780037816755 @@ -16587,7 +16602,7 @@ packages: license: BSD-2-Clause license_family: BSD purls: - - pkg:pypi/xxhash?source=hash-mapping + - pkg:pypi/xxhash?source=compressed-mapping run_exports: {} size: 24805 timestamp: 1779976911988 @@ -16680,7 +16695,7 @@ packages: license: BSD-3-Clause license_family: BSD purls: - - pkg:pypi/torch?source=hash-mapping + - pkg:pypi/torch?source=compressed-mapping run_exports: weak: - pytorch >=2.12.0,<2.13.0a0 @@ -17274,7 +17289,7 @@ packages: license: BSD-3-Clause license_family: BSD purls: - - pkg:pypi/scikit-learn?source=hash-mapping + - pkg:pypi/scikit-learn?source=compressed-mapping run_exports: {} size: 10311253 timestamp: 1780401051520 @@ -17320,6 +17335,7 @@ packages: - python >=3.12,<3.13.0a0 - python_abi 3.12.* *_cp312 license: BSD-3-Clause + license_family: BSD purls: - pkg:pypi/scipy?source=compressed-mapping run_exports: {} @@ -17343,6 +17359,7 @@ packages: - python >=3.14,<3.15.0a0 - python_abi 3.14.* *_cp314 license: BSD-3-Clause + license_family: BSD purls: - pkg:pypi/scipy?source=compressed-mapping run_exports: {} @@ -17594,7 +17611,7 @@ packages: license: Apache-2.0 license_family: Apache purls: - - pkg:pypi/tornado?source=hash-mapping + - 
pkg:pypi/tornado?source=compressed-mapping run_exports: {} size: 881976 timestamp: 1781006805257 @@ -17609,7 +17626,7 @@ packages: license: Apache-2.0 license_family: Apache purls: - - pkg:pypi/tornado?source=compressed-mapping + - pkg:pypi/tornado?source=hash-mapping run_exports: {} size: 864705 timestamp: 1781006801632 @@ -17624,7 +17641,7 @@ packages: license: Apache-2.0 license_family: Apache purls: - - pkg:pypi/tornado?source=hash-mapping + - pkg:pypi/tornado?source=compressed-mapping run_exports: {} size: 918368 timestamp: 1781006801436 @@ -17859,6 +17876,7 @@ packages: - __glibc >=2.17,<3.0.a0 - xorg-libx11 >=1.8.13,<2.0a0 license: MIT + license_family: MIT purls: [] run_exports: {} size: 441670 @@ -18656,7 +18674,7 @@ packages: license: BSD-3-Clause license_family: BSD purls: - - pkg:pypi/click?source=hash-mapping + - pkg:pypi/click?source=compressed-mapping run_exports: {} size: 104080 timestamp: 1779900586237 @@ -19303,7 +19321,7 @@ packages: license: BSD-3-Clause license_family: BSD purls: - - pkg:pypi/ipykernel?source=compressed-mapping + - pkg:pypi/ipykernel?source=hash-mapping run_exports: {} size: 138635 timestamp: 1781101665847 @@ -19328,7 +19346,7 @@ packages: license: BSD-3-Clause license_family: BSD purls: - - pkg:pypi/ipython?source=compressed-mapping + - pkg:pypi/ipython?source=hash-mapping run_exports: {} size: 652893 timestamp: 1780654403616 @@ -19353,7 +19371,7 @@ packages: license: BSD-3-Clause license_family: BSD purls: - - pkg:pypi/ipython?source=hash-mapping + - pkg:pypi/ipython?source=compressed-mapping run_exports: {} size: 652076 timestamp: 1780654438137 @@ -19441,6 +19459,7 @@ packages: depends: - python >=3.10 license: Apache-2.0 + license_family: APACHE purls: - pkg:pypi/json5?source=compressed-mapping run_exports: {} @@ -19557,7 +19576,7 @@ packages: license: BSD-3-Clause license_family: BSD purls: - - pkg:pypi/jupyter-client?source=hash-mapping + - pkg:pypi/jupyter-client?source=compressed-mapping run_exports: {} size: 117954 
timestamp: 1781019994076 @@ -19856,7 +19875,7 @@ packages: license: MIT license_family: MIT purls: - - pkg:pypi/mdit-py-plugins?source=compressed-mapping + - pkg:pypi/mdit-py-plugins?source=hash-mapping run_exports: {} size: 50460 timestamp: 1778692223625 @@ -19886,9 +19905,9 @@ packages: run_exports: {} size: 36168 timestamp: 1764885507963 -- conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.2.1-pyhcf101f3_0.conda - sha256: b52dc6c78fbbe7a3008535cb8bfd87d70d8053e9250bbe16e387470a9df07070 - md5: b97e84d1553b4a1c765b87fff83453ad +- conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.3.1-pyhcf101f3_0.conda + sha256: 240fbb25ca907465df57fe5ba2b040fc868fa88dfa7da42741b3b8bd092b4f17 + md5: 1fe73f1762c2114c946cf2e7f074cc43 depends: - python >=3.10 - typing_extensions @@ -19896,10 +19915,10 @@ packages: license: BSD-3-Clause license_family: BSD purls: - - pkg:pypi/mistune?source=hash-mapping + - pkg:pypi/mistune?source=compressed-mapping run_exports: {} - size: 74567 - timestamp: 1777824616382 + size: 86966 + timestamp: 1782128220984 - conda: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.4.1-pyhd8ed1ab_0.conda sha256: 5bbf2f8179ec43d34d67ca8e4989d216c1bdb4b749fe6cb40e86ebf88c1b5300 md5: 2e81b32b805f406d23ba61938a184081 @@ -20268,7 +20287,7 @@ packages: license: MIT license_family: MIT purls: - - pkg:pypi/polars?source=hash-mapping + - pkg:pypi/polars?source=compressed-mapping run_exports: {} size: 540108 timestamp: 1780146392384 @@ -20498,10 +20517,11 @@ packages: run_exports: {} size: 21085 timestamp: 1733217331982 -- conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_1.conda - sha256: 5df2fdef7862720d45482ed1519ad1188f7b49f802c3a9ea9e141c7ffa911258 - md5: a4b80078d87b335d39c447e20ae857c2 +- conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.1.1-pyhc364b38_2.conda + sha256: 430051d80765207a7d782b2b188230ba1489d35c6e75fd9903f76cb9fda4af16 + md5: 64c98a12c4e23eb238bf66bbecafdf3c depends: + - colorama - 
pygments >=2.7.2 - python >=3.10 - iniconfig >=1.0.1 @@ -20513,11 +20533,12 @@ packages: constrains: - pytest-faulthandler >=2 license: MIT + license_family: MIT purls: - pkg:pypi/pytest?source=compressed-mapping run_exports: {} - size: 306672 - timestamp: 1781879457958 + size: 306724 + timestamp: 1782127176429 - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.1.0-pyhcf101f3_0.conda sha256: 44e42919397bd00bfaa47358a6ca93d4c21493a8c18600176212ec21a8d25ca5 md5: 67d1790eefa81ed305b89d8e314c7923 @@ -20575,7 +20596,7 @@ packages: license: MIT license_family: MIT purls: - - pkg:pypi/python-discovery?source=hash-mapping + - pkg:pypi/python-discovery?source=compressed-mapping run_exports: {} size: 35514 timestamp: 1781257630962 @@ -20645,7 +20666,7 @@ packages: license: BSD-2-Clause license_family: BSD purls: - - pkg:pypi/python-json-logger?source=hash-mapping + - pkg:pypi/python-json-logger?source=compressed-mapping run_exports: {} size: 19249 timestamp: 1781036004580 @@ -21250,6 +21271,34 @@ packages: run_exports: {} size: 17546 timestamp: 1750694360605 +- conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda + sha256: d57d93accf0fd40769eff17b84b30b5980b877240a393e3e83495f33eb282784 + md5: 6b170f1a7d5c1729073c354b2d0ac32d + depends: + - python >=3.10 + - sphinx + license: MIT + license_family: MIT + purls: + - pkg:pypi/sphinx-llms-txt?source=hash-mapping + run_exports: {} + size: 25685 + timestamp: 1765935234507 +- conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-markdown-builder-0.6.10-pyhd8ed1ab_0.conda + sha256: 57079789716b56cc198c1f8518d9422c62380ebc7cb77b3170ece04b1d914f17 + md5: e804fed0abd0c8df4ff40e3084d724a0 + depends: + - docutils + - python >=3.10 + - sphinx >=5.1.0 + - tabulate + license: MIT + license_family: MIT + purls: + - pkg:pypi/sphinx-markdown-builder?source=hash-mapping + run_exports: {} + size: 22212 + timestamp: 1773231549728 - conda: 
https://conda.anaconda.org/conda-forge/noarch/sphinx-sitemap-2.9.0-pyhcf101f3_0.conda sha256: 1be6289124207256df5dfbfe6ff0a652e313ac5c3e50560c9e510afa76eb702b md5: 3baeff262222dc87e978a68702bc5797 @@ -21337,7 +21386,7 @@ packages: license: BSD-2-Clause license_family: BSD purls: - - pkg:pypi/sphinxcontrib-serializinghtml?source=hash-mapping + - pkg:pypi/sphinxcontrib-serializinghtml?source=compressed-mapping run_exports: {} size: 30640 timestamp: 1781260357443 @@ -22213,6 +22262,7 @@ packages: - aws-c-event-stream >=0.7.1,<0.7.2.0a0 - aws-crt-cpp >=0.40.1,<0.40.2.0a0 license: Apache-2.0 + license_family: APACHE purls: [] run_exports: weak: @@ -22534,9 +22584,9 @@ packages: run_exports: {} size: 301747 timestamp: 1769156235399 -- conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.2-py310h399bfa0_0.conda - sha256: 79f8859336c9206dc4e94c2955e92061b3c190d2599fe5092189ca8ccfb38400 - md5: 4ca376f9161dce28b811ead2bbad6c35 +- conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.3-py310h399bfa0_0.conda + sha256: 036b5c73f083dbe1101774d2eb3335a7399d444b0ca2ceca93188bfb0cae17a9 + md5: acaf1482fd8aa35122e51719342574f3 depends: - __osx >=11.0 - python >=3.10,<3.11.0a0 @@ -22544,13 +22594,13 @@ packages: - tomli license: Apache-2.0 purls: - - pkg:pypi/coverage?source=compressed-mapping + - pkg:pypi/coverage?source=hash-mapping run_exports: {} - size: 317359 - timestamp: 1781985155915 -- conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.2-py314h77fa6c7_0.conda - sha256: b1d80c7e7627c539e1df5c755f395e9a5632648a7784f7afd82877ee4097af33 - md5: be018c59f300e5d699a6628643ac70fa + size: 316607 + timestamp: 1782178307947 +- conda: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.14.3-py314h77fa6c7_0.conda + sha256: 2ce69da279b58d54aae1de8e2255e12b0b1312f23d6a8d1ee00c2987697710cb + md5: 14b8c111c28ab7e00108c5af338efc2e depends: - __osx >=11.0 - python >=3.14,<3.15.0a0 @@ -22558,10 +22608,10 @@ packages: - tomli license: Apache-2.0 
purls: - - pkg:pypi/coverage?source=compressed-mapping + - pkg:pypi/coverage?source=hash-mapping run_exports: {} - size: 417492 - timestamp: 1781985129168 + size: 416913 + timestamp: 1782178576118 - conda: https://conda.anaconda.org/conda-forge/osx-64/epoxy-1.5.10-h8616949_2.conda sha256: d5c466bddf423a788ce5c39af20af41ebaf3de9dc9e807098fc9bf45c3c7db45 md5: efe7fa6c60b20cb0a3a22e8c3e7b721e @@ -23096,10 +23146,10 @@ packages: - libarrow >=15.0.2,<16.0a0 size: 5760884 timestamp: 1737669783258 -- conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-24.0.0-hf9fdb71_7_cpu.conda - build_number: 7 - sha256: acaa55957d26f70a34a1805beef8ab15e33eab273bfe0f848bd1788a9664fbc1 - md5: e8dd2a086d53bd982793102512c89982 +- conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-24.0.0-haea8852_8_cpu.conda + build_number: 8 + sha256: 83b20f3199dc1a862dba28f49dfed5ee02bceb0489c1988636210a9a249477f4 + md5: 431d0fc7fe61570d0eb26763ce93a081 depends: - __osx >=11.0 - aws-crt-cpp >=0.40.1,<0.40.2.0a0 @@ -23115,8 +23165,8 @@ packages: - libbrotlidec >=1.2.0,<1.3.0a0 - libbrotlienc >=1.2.0,<1.3.0a0 - libcxx >=21 - - libgoogle-cloud >=3.5.0,<3.6.0a0 - - libgoogle-cloud-storage >=3.5.0,<3.6.0a0 + - libgoogle-cloud >=3.6.0,<3.7.0a0 + - libgoogle-cloud-storage >=3.6.0,<3.7.0a0 - libopentelemetry-cpp >=1.27.0,<1.28.0a0 - libprotobuf >=6.33.5,<6.33.6.0a0 - libzlib >=1.3.2,<2.0a0 @@ -23126,15 +23176,15 @@ packages: - zstd >=1.5.7,<1.6.0a0 constrains: - arrow-cpp <0.0a0 - - parquet-cpp <0.0a0 - apache-arrow-proc =*=cpu + - parquet-cpp <0.0a0 license: Apache-2.0 purls: [] run_exports: weak: - libarrow >=24.0.0,<24.1.0a0 - size: 4382939 - timestamp: 1781910123075 + size: 4385382 + timestamp: 1782185595616 - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-acero-15.0.2-he6f7923_55_cpu.conda build_number: 55 sha256: f26c9c176ba41c3bd417bffec845f059d1cadb3e4c69c8299e7a6dbd34371112 @@ -23151,16 +23201,16 @@ packages: - libarrow-acero >=15.0.2,<16.0a0 size: 531141 timestamp: 
1737669909951 -- conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-acero-24.0.0-h91633f5_7_cpu.conda - build_number: 7 - sha256: 59652af9a33aa1549ac4b2ac2434f0ab2eb84ff73fe41901f1f7e6ee7de6a8ab - md5: 63c0bde25b99d990e13198b0569e7da7 +- conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-acero-24.0.0-h91633f5_8_cpu.conda + build_number: 8 + sha256: ab56bd77f8719833f7818bbfd423682206502243bfcfb76d82dee6bf240e715f + md5: c2a280c00920f8f8279744cadee14a69 depends: - __osx >=11.0 - libabseil * cxx17* - libabseil >=20260107.1,<20260108.0a0 - - libarrow 24.0.0 hf9fdb71_7_cpu - - libarrow-compute 24.0.0 hb38465b_7_cpu + - libarrow 24.0.0 haea8852_8_cpu + - libarrow-compute 24.0.0 hb38465b_8_cpu - libcxx >=21 - libopentelemetry-cpp >=1.27.0,<1.28.0a0 - libprotobuf >=6.33.5,<6.33.6.0a0 @@ -23169,17 +23219,17 @@ packages: run_exports: weak: - libarrow-acero >=24.0.0,<24.1.0a0 - size: 543653 - timestamp: 1781910650718 -- conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-compute-24.0.0-hb38465b_7_cpu.conda - build_number: 7 - sha256: edfea918f6c999dec73c275a773912a0d16f3262684516e28b5411f4c6be7c93 - md5: 350e4a18d883c23f55782be4ad41d5dd + size: 543667 + timestamp: 1782186203553 +- conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-compute-24.0.0-hb38465b_8_cpu.conda + build_number: 8 + sha256: 7b888f962e2a5656afb59f18d9cf4bbb711d5f36a2d1c4ecc8eb4ed4f0dc0961 + md5: 941d153c204682ceada2d683eeb8d923 depends: - __osx >=11.0 - libabseil * cxx17* - libabseil >=20260107.1,<20260108.0a0 - - libarrow 24.0.0 hf9fdb71_7_cpu + - libarrow 24.0.0 haea8852_8_cpu - libcxx >=21 - libopentelemetry-cpp >=1.27.0,<1.28.0a0 - libprotobuf >=6.33.5,<6.33.6.0a0 @@ -23191,8 +23241,8 @@ packages: run_exports: weak: - libarrow-compute >=24.0.0,<24.1.0a0 - size: 2386224 - timestamp: 1781910310364 + size: 2385343 + timestamp: 1782185805595 - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-dataset-15.0.2-he6f7923_55_cpu.conda build_number: 55 sha256: 
5d774bc414b12245ab31567079a86ffb3efb9f46f4d35f1b4723bcd5d3c661ec @@ -23211,28 +23261,28 @@ packages: - libarrow-dataset >=15.0.2,<16.0a0 size: 529321 timestamp: 1737671005879 -- conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-dataset-24.0.0-h91633f5_7_cpu.conda - build_number: 7 - sha256: af341020d88b09d5accb8c8311a549c5c0b9e242f0f025c0e908f39da11d0de5 - md5: 3905be162b0965aad08f0b610a38bf9f +- conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-dataset-24.0.0-h91633f5_8_cpu.conda + build_number: 8 + sha256: 98586c07943c91c47bb1e8edb4f69dc6b66db026a5b3aad5ffba58d37c03e38f + md5: 9df631d619319e6e8002c572426d523d depends: - __osx >=11.0 - libabseil * cxx17* - libabseil >=20260107.1,<20260108.0a0 - - libarrow 24.0.0 hf9fdb71_7_cpu - - libarrow-acero 24.0.0 h91633f5_7_cpu - - libarrow-compute 24.0.0 hb38465b_7_cpu + - libarrow 24.0.0 haea8852_8_cpu + - libarrow-acero 24.0.0 h91633f5_8_cpu + - libarrow-compute 24.0.0 hb38465b_8_cpu - libcxx >=21 - libopentelemetry-cpp >=1.27.0,<1.28.0a0 - - libparquet 24.0.0 h0f82bca_7_cpu + - libparquet 24.0.0 h0f82bca_8_cpu - libprotobuf >=6.33.5,<6.33.6.0a0 license: Apache-2.0 purls: [] run_exports: weak: - libarrow-dataset >=24.0.0,<24.1.0a0 - size: 534287 - timestamp: 1781910914325 + size: 533279 + timestamp: 1782186488064 - conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-flight-15.0.2-hb1276e4_55_cpu.conda build_number: 55 sha256: e97954e95f78b4dab8ec5baa377f1f6695bcd05de3ab31bf54ab779fda315f8b @@ -23311,17 +23361,17 @@ packages: - libarrow-substrait >=15.0.2,<16.0a0 size: 439252 timestamp: 1737671145916 -- conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-substrait-24.0.0-h613493e_7_cpu.conda - build_number: 7 - sha256: fa77603b9094f4b19a1edcac5f9179b45193fa2995554e9bb7549691d3a2f0f3 - md5: 936820b1c781e6e73287d5f73a6e2000 +- conda: https://conda.anaconda.org/conda-forge/osx-64/libarrow-substrait-24.0.0-h613493e_8_cpu.conda + build_number: 8 + sha256: 
ae0d0fdca5bf4503fd99f0a9f2a63c1180ddef62aab29a00ca460a54aea735c3 + md5: 8d47ac586d9d9a196f5d3001403a0809 depends: - __osx >=11.0 - libabseil * cxx17* - libabseil >=20260107.1,<20260108.0a0 - - libarrow 24.0.0 hf9fdb71_7_cpu - - libarrow-acero 24.0.0 h91633f5_7_cpu - - libarrow-dataset 24.0.0 h91633f5_7_cpu + - libarrow 24.0.0 haea8852_8_cpu + - libarrow-acero 24.0.0 h91633f5_8_cpu + - libarrow-dataset 24.0.0 h91633f5_8_cpu - libcxx >=21 - libprotobuf >=6.33.5,<6.33.6.0a0 license: Apache-2.0 @@ -23329,8 +23379,8 @@ packages: run_exports: weak: - libarrow-substrait >=24.0.0,<24.1.0a0 - size: 448787 - timestamp: 1781911013226 + size: 448808 + timestamp: 1782186582807 - conda: https://conda.anaconda.org/conda-forge/osx-64/libblas-3.11.0-8_he492b99_openblas.conda build_number: 8 sha256: 55cf9f92a2d07c33f8a32c44ff1528ea48fd69677cc003a4532d09b71cb8a316 @@ -23765,9 +23815,9 @@ packages: - libgoogle-cloud >=2.34.0,<2.35.0a0 size: 897554 timestamp: 1737284704797 -- conda: https://conda.anaconda.org/conda-forge/osx-64/libgoogle-cloud-3.5.0-h8b848e0_1.conda - sha256: f6f23551b2f4b9c9b3e0c72398e4995702e832ee03b717e4d9802ce695f6938a - md5: 323f0d14ccec33e69a6c16a11f3ec7c1 +- conda: https://conda.anaconda.org/conda-forge/osx-64/libgoogle-cloud-3.6.0-h8b848e0_0.conda + sha256: 93bc6400aaa20aad9de27c6f42f9c31dcddf8466ba9588c5bc4df644013267bf + md5: 0617521fb705f0c4b6ad40352f1666d1 depends: - __osx >=11.0 - libabseil * cxx17* @@ -23777,17 +23827,17 @@ packages: - libgrpc >=1.78.1,<1.79.0a0 - libopentelemetry-cpp >=1.27.0,<1.28.0a0 - libprotobuf >=6.33.5,<6.33.6.0a0 - - openssl >=3.5.6,<4.0a0 + - openssl >=3.5.7,<4.0a0 constrains: - - libgoogle-cloud 3.5.0 *_1 + - libgoogle-cloud 3.6.0 *_0 license: Apache-2.0 license_family: Apache purls: [] run_exports: weak: - - libgoogle-cloud >=3.5.0,<3.6.0a0 - size: 1882201 - timestamp: 1780030929238 + - libgoogle-cloud >=3.6.0,<3.7.0a0 + size: 1858658 + timestamp: 1781924653666 - conda: 
https://conda.anaconda.org/conda-forge/osx-64/libgoogle-cloud-storage-2.34.0-h3f2b517_0.conda sha256: e4d78f5226cc319d578731b7736680c2b4c0c18663d6fb48ddf132d6c3913394 md5: c6962e0181e6edca75e236f8e0c1ea53 @@ -23808,16 +23858,16 @@ packages: - libgoogle-cloud-storage >=2.34.0,<2.35.0a0 size: 544381 timestamp: 1737285870673 -- conda: https://conda.anaconda.org/conda-forge/osx-64/libgoogle-cloud-storage-3.5.0-hea209c6_1.conda - sha256: 086374067de8b3fd6198f87f8a7879d5042e35a7816e2a570155a3590e480a0d - md5: 8c84b06d18a3c83c28eb89bca378daad +- conda: https://conda.anaconda.org/conda-forge/osx-64/libgoogle-cloud-storage-3.6.0-hea209c6_0.conda + sha256: dc6272ad015a5d6a4cf4263ef11fd2d3889fdafbb9a049121aa6daaf7a165b4f + md5: 3ab72a6e7f7c1b54e2ace239d06e9e7a depends: - __osx >=11.0 - libabseil - libcrc32c >=1.1.2,<1.2.0a0 - libcurl - libcxx >=19 - - libgoogle-cloud 3.5.0 h8b848e0_1 + - libgoogle-cloud 3.6.0 h8b848e0_0 - libzlib >=1.3.2,<2.0a0 - openssl license: Apache-2.0 @@ -23825,9 +23875,9 @@ packages: purls: [] run_exports: weak: - - libgoogle-cloud-storage >=3.5.0,<3.6.0a0 - size: 541328 - timestamp: 1780031289207 + - libgoogle-cloud-storage >=3.6.0,<3.7.0a0 + size: 541859 + timestamp: 1781924972932 - conda: https://conda.anaconda.org/conda-forge/osx-64/libgrpc-1.67.1-h4896ac0_2.conda sha256: 1704fc25a408d89d5efd841ad0a3b42ba1a8b189afa40b89995c74da83058d91 md5: c1f24237a5024ae9b3820401511a1660 @@ -24096,15 +24146,15 @@ packages: - libparquet >=15.0.2,<16.0a0 size: 943787 timestamp: 1737670924761 -- conda: https://conda.anaconda.org/conda-forge/osx-64/libparquet-24.0.0-h0f82bca_7_cpu.conda - build_number: 7 - sha256: 7102d7ad47b55bd4ae4d8a611611e4f9aa5219929e5325f49cce4fe252db58f2 - md5: 8986eeddb6032853bdfb24eaf1171e4d +- conda: https://conda.anaconda.org/conda-forge/osx-64/libparquet-24.0.0-h0f82bca_8_cpu.conda + build_number: 8 + sha256: 2a4c24da0a23de9cc2975c00660b9c112383003351526d0e7930db62ca2f7e1c + md5: 946d94475bb4ad57c60d771dda4527df depends: - __osx >=11.0 - 
libabseil * cxx17* - libabseil >=20260107.1,<20260108.0a0 - - libarrow 24.0.0 hf9fdb71_7_cpu + - libarrow 24.0.0 haea8852_8_cpu - libcxx >=21 - libopentelemetry-cpp >=1.27.0,<1.28.0a0 - libprotobuf >=6.33.5,<6.33.6.0a0 @@ -24115,8 +24165,8 @@ packages: run_exports: weak: - libparquet >=24.0.0,<24.1.0a0 - size: 1119322 - timestamp: 1781910575147 + size: 1119810 + timestamp: 1782186109014 - conda: https://conda.anaconda.org/conda-forge/osx-64/libpng-1.6.58-he930e7c_0.conda sha256: a669b22978e546484d18d99a210801b1823360a266d7035c713d8d1facd035f7 md5: 9744d43d5200f284260637304a069ddd @@ -24684,6 +24734,7 @@ packages: constrains: - numpy-base <0a0 license: BSD-3-Clause + license_family: BSD purls: - pkg:pypi/numpy?source=hash-mapping run_exports: @@ -25311,6 +25362,7 @@ packages: - python >=3.14,<3.15.0a0 - python_abi 3.14.* *_cp314 license: BSD-3-Clause + license_family: BSD purls: - pkg:pypi/scipy?source=compressed-mapping run_exports: {} @@ -25423,7 +25475,7 @@ packages: license: Apache-2.0 license_family: Apache purls: - - pkg:pypi/tornado?source=hash-mapping + - pkg:pypi/tornado?source=compressed-mapping run_exports: {} size: 915832 timestamp: 1781007541495 @@ -26262,6 +26314,7 @@ packages: - libcurl >=8.20.0,<9.0a0 - aws-c-event-stream >=0.7.1,<0.7.2.0a0 license: Apache-2.0 + license_family: APACHE purls: [] run_exports: weak: @@ -26806,9 +26859,9 @@ packages: run_exports: {} size: 290405 timestamp: 1769156069514 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.2-py310hb46c203_0.conda - sha256: e38460e3258cdc5ceb8ef61523e18b25d349ccb2b0ccc7af45e6bf2087b82112 - md5: 7ada5f3f83011282aaa49aae8b5953fd +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.3-py310hb46c203_0.conda + sha256: 84955228de7326b188fcfb0a9465c0af10b2da304cbe6c233c2a032b475537c8 + md5: 7ea8141e34515df14cee891b551d799a depends: - __osx >=11.0 - python >=3.10,<3.11.0a0 @@ -26817,13 +26870,13 @@ packages: - tomli license: Apache-2.0 purls: - - 
pkg:pypi/coverage?source=compressed-mapping + - pkg:pypi/coverage?source=hash-mapping run_exports: {} - size: 318364 - timestamp: 1781985300027 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.2-py311hc290fe0_0.conda - sha256: 2f8fbe4e8bfeefbd15260377ad4060162907c5612df8ad9c768ef32f1800c2a8 - md5: eb1d9dcb67dc3e2a3aad41f270b1467c + size: 317808 + timestamp: 1782178475278 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.3-py311hc290fe0_0.conda + sha256: 27306b6a944eae0565fb7d4483b7de287f739b1e8f11363e4b259b4b54b02d32 + md5: 3d7e96733f48d9623ab9cef29c45e7c0 depends: - __osx >=11.0 - python >=3.11,<3.12.0a0 @@ -26834,11 +26887,11 @@ packages: purls: - pkg:pypi/coverage?source=hash-mapping run_exports: {} - size: 404033 - timestamp: 1781985211910 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.2-py313h65a2061_0.conda - sha256: fb76acfbada2ffc52d9b13d4dd7c7fd16086575b489cfb7844015e8420184e93 - md5: a59d9ddc4f49c5005ff7e627dfb5f885 + size: 403734 + timestamp: 1782178661341 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.3-py313h65a2061_0.conda + sha256: d2730d071b5c0f1a000b04f513b6d2d949684a500d569ce9bf7ed5ffab5596e0 + md5: b087cd0441275628d2f3d59744f86316 depends: - __osx >=11.0 - python >=3.13,<3.14.0a0 @@ -26847,13 +26900,13 @@ packages: - tomli license: Apache-2.0 purls: - - pkg:pypi/coverage?source=compressed-mapping + - pkg:pypi/coverage?source=hash-mapping run_exports: {} - size: 400789 - timestamp: 1781985404040 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.2-py314h6e9b3f0_0.conda - sha256: 42b307c81b551b2a2ae3a7779b94e9bae7f1f52fef762e705cc856c07b47065f - md5: 57666e340ea35bedd959a1aefbe716a5 + size: 401430 + timestamp: 1782178606926 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/coverage-7.14.3-py314h6e9b3f0_0.conda + sha256: 8de3938e100bbbd775d3b0c6d121c29e7aa2cd0024a9b5ee8023706c0b8f28e6 + md5: 
cb60422148fa8d70cbb6fc290e79e21c depends: - __osx >=11.0 - python >=3.14,<3.15.0a0 @@ -26862,10 +26915,10 @@ packages: - tomli license: Apache-2.0 purls: - - pkg:pypi/coverage?source=compressed-mapping + - pkg:pypi/coverage?source=hash-mapping run_exports: {} - size: 418640 - timestamp: 1781985167562 + size: 417888 + timestamp: 1782178502982 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/debugpy-1.8.21-py313h1188861_0.conda sha256: 603ed94c0c45089b4c93f04b00444322b7e154a7cf73135c8e494b0e4eefc4d9 md5: 7d6048d219ebf46e96d44c077eb8cb44 @@ -26878,7 +26931,7 @@ packages: license: MIT license_family: MIT purls: - - pkg:pypi/debugpy?source=compressed-mapping + - pkg:pypi/debugpy?source=hash-mapping run_exports: {} size: 2754468 timestamp: 1780390249891 @@ -27661,10 +27714,10 @@ packages: - libarrow >=20.0.0,<20.1.0a0 size: 5649699 timestamp: 1774279750659 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-24.0.0-h1caba66_7_cpu.conda - build_number: 7 - sha256: f9a33a46a7d7137dfdc0f9411cfeae451d1c7ed1f05211dbca8d8e189cfafa28 - md5: 8c0da832fe315fa6dbb54eb02d540441 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-24.0.0-h6045e8e_8_cpu.conda + build_number: 8 + sha256: 3fb75f077b1b1a2fc577e96c7c7bbb149c38819c3807795d4919fdfbf4d35f9e + md5: f4c6a48b3bf53c59a0f3ee5f5b492c62 depends: - __osx >=11.0 - aws-crt-cpp >=0.40.1,<0.40.2.0a0 @@ -27680,8 +27733,8 @@ packages: - libbrotlidec >=1.2.0,<1.3.0a0 - libbrotlienc >=1.2.0,<1.3.0a0 - libcxx >=21 - - libgoogle-cloud >=3.5.0,<3.6.0a0 - - libgoogle-cloud-storage >=3.5.0,<3.6.0a0 + - libgoogle-cloud >=3.6.0,<3.7.0a0 + - libgoogle-cloud-storage >=3.6.0,<3.7.0a0 - libopentelemetry-cpp >=1.27.0,<1.28.0a0 - libprotobuf >=6.33.5,<6.33.6.0a0 - libzlib >=1.3.2,<2.0a0 @@ -27690,16 +27743,16 @@ packages: - snappy >=1.2.2,<1.3.0a0 - zstd >=1.5.7,<1.6.0a0 constrains: + - apache-arrow-proc =*=cpu - arrow-cpp <0.0a0 - parquet-cpp <0.0a0 - - apache-arrow-proc =*=cpu license: Apache-2.0 purls: [] 
run_exports: weak: - libarrow >=24.0.0,<24.1.0a0 - size: 4251232 - timestamp: 1781909232203 + size: 4262820 + timestamp: 1782184446658 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-acero-15.0.2-hb0f823f_55_cpu.conda build_number: 55 sha256: 0499863afea289a460646ec5fc155c5dd0fba81802b6978dba7fc6a2ac322062 @@ -27736,16 +27789,16 @@ packages: - libarrow-acero >=20.0.0,<20.1.0a0 size: 511880 timestamp: 1774279965265 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-acero-24.0.0-ha4f4840_7_cpu.conda - build_number: 7 - sha256: c990529616309850ec3b5ceb14fe51710ec787528423a3c87a2bc27922623ec1 - md5: f76650b0e81e5afdc84cf38647ebc3e4 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-acero-24.0.0-ha4f4840_8_cpu.conda + build_number: 8 + sha256: 46c20e39c6104cba3c74c781671933ccdebb958131003ff61d50a552e0b8f8e6 + md5: 39a30fa1ac563d314ea65741e6761739 depends: - __osx >=11.0 - libabseil * cxx17* - libabseil >=20260107.1,<20260108.0a0 - - libarrow 24.0.0 h1caba66_7_cpu - - libarrow-compute 24.0.0 h8d10c55_7_cpu + - libarrow 24.0.0 h6045e8e_8_cpu + - libarrow-compute 24.0.0 h8d10c55_8_cpu - libcxx >=21 - libopentelemetry-cpp >=1.27.0,<1.28.0a0 - libprotobuf >=6.33.5,<6.33.6.0a0 @@ -27754,17 +27807,17 @@ packages: run_exports: weak: - libarrow-acero >=24.0.0,<24.1.0a0 - size: 520078 - timestamp: 1781909741500 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-compute-24.0.0-h8d10c55_7_cpu.conda - build_number: 7 - sha256: ee2efa4ee262f8d2dbef81e37bbc018980dd7b85fc68e118e4f2b7c1b2500772 - md5: f3c8ab2c55e91c32df74428ff8c24468 + size: 519849 + timestamp: 1782184880774 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-compute-24.0.0-h8d10c55_8_cpu.conda + build_number: 8 + sha256: 9f73c59cfbe680f0328098f20e4116df36b45484a204743813ebec03e8a70a2a + md5: 8d9f3dc3909108ddf4bc5411623b2ccc depends: - __osx >=11.0 - libabseil * cxx17* - libabseil >=20260107.1,<20260108.0a0 - - libarrow 24.0.0 
h1caba66_7_cpu + - libarrow 24.0.0 h6045e8e_8_cpu - libcxx >=21 - libopentelemetry-cpp >=1.27.0,<1.28.0a0 - libprotobuf >=6.33.5,<6.33.6.0a0 @@ -27776,8 +27829,8 @@ packages: run_exports: weak: - libarrow-compute >=24.0.0,<24.1.0a0 - size: 2240794 - timestamp: 1781909390570 + size: 2243840 + timestamp: 1782184563897 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-dataset-15.0.2-hb0f823f_55_cpu.conda build_number: 55 sha256: 2ab158326d3eddc3714d5b1c326e90e8c6c80d009bc321164d128e4ae8170c3b @@ -27818,28 +27871,28 @@ packages: - libarrow-dataset >=20.0.0,<20.1.0a0 size: 513371 timestamp: 1774280294550 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-dataset-24.0.0-ha4f4840_7_cpu.conda - build_number: 7 - sha256: 2be94c8f7710ac5aea5ca523c8231f8134e69afc5851b135856a0fbfac0627df - md5: 7fbefdfed50106cad702c996715e3167 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-dataset-24.0.0-ha4f4840_8_cpu.conda + build_number: 8 + sha256: d1db32a0d814236188945bee083e49d578dc0b35715367c5823b7125d7aeee25 + md5: f22bc2e04525bee90f5e89c2661f796f depends: - __osx >=11.0 - libabseil * cxx17* - libabseil >=20260107.1,<20260108.0a0 - - libarrow 24.0.0 h1caba66_7_cpu - - libarrow-acero 24.0.0 ha4f4840_7_cpu - - libarrow-compute 24.0.0 h8d10c55_7_cpu + - libarrow 24.0.0 h6045e8e_8_cpu + - libarrow-acero 24.0.0 ha4f4840_8_cpu + - libarrow-compute 24.0.0 h8d10c55_8_cpu - libcxx >=21 - libopentelemetry-cpp >=1.27.0,<1.28.0a0 - - libparquet 24.0.0 h840b369_7_cpu + - libparquet 24.0.0 h840b369_8_cpu - libprotobuf >=6.33.5,<6.33.6.0a0 license: Apache-2.0 purls: [] run_exports: weak: - libarrow-dataset >=24.0.0,<24.1.0a0 - size: 518400 - timestamp: 1781909947875 + size: 518870 + timestamp: 1782185056190 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-flight-15.0.2-h302cddd_55_cpu.conda build_number: 55 sha256: ab752b40d3db15d08bbc38aaaed722764525353c8789c6848fb1bc0785a42558 @@ -27939,17 +27992,17 @@ packages: - 
libarrow-substrait >=20.0.0,<20.1.0a0 size: 449428 timestamp: 1774280565431 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-substrait-24.0.0-h05be00f_7_cpu.conda - build_number: 7 - sha256: 49cdd6974804c0b8848da7eafb01d252cbd72164ab9a8007c8b08a54e3b98d87 - md5: 46fa6fe2ecf18c912d4e39bbcc39df2c +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libarrow-substrait-24.0.0-h05be00f_8_cpu.conda + build_number: 8 + sha256: 1ae21b67f081aea9f136141b95691e2e98596ab733bbc261577f998dd08f88bf + md5: 6d82f177aff9e47fed86ae793318b4ad depends: - __osx >=11.0 - libabseil * cxx17* - libabseil >=20260107.1,<20260108.0a0 - - libarrow 24.0.0 h1caba66_7_cpu - - libarrow-acero 24.0.0 ha4f4840_7_cpu - - libarrow-dataset 24.0.0 ha4f4840_7_cpu + - libarrow 24.0.0 h6045e8e_8_cpu + - libarrow-acero 24.0.0 ha4f4840_8_cpu + - libarrow-dataset 24.0.0 ha4f4840_8_cpu - libcxx >=21 - libprotobuf >=6.33.5,<6.33.6.0a0 license: Apache-2.0 @@ -27957,8 +28010,8 @@ packages: run_exports: weak: - libarrow-substrait >=24.0.0,<24.1.0a0 - size: 454600 - timestamp: 1781910030883 + size: 454799 + timestamp: 1782185112950 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libblas-3.11.0-8_h51639a9_openblas.conda build_number: 8 sha256: 8f5ec18ead0619a9cf0f38b49796c22f6fc0f44850c0df2baea0f5277db16e75 @@ -28416,9 +28469,9 @@ packages: - libgoogle-cloud >=3.3.0,<3.4.0a0 size: 1773417 timestamp: 1774214139261 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgoogle-cloud-3.5.0-h688a705_1.conda - sha256: 20235ded7b8d125461a9ed5e02f174eae89e85a271d3343167015f779ebc4714 - md5: 3899a5a69da373a85e7f53be3d32b814 +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgoogle-cloud-3.6.0-h688a705_0.conda + sha256: 650f0605bed3048ca69b547cc31e1d6c70b7371fb3212b00b103da6fd2f11d77 + md5: be005bcbd77890a199ee583ba7a74bec depends: - __osx >=11.0 - libabseil * cxx17* @@ -28428,17 +28481,17 @@ packages: - libgrpc >=1.78.1,<1.79.0a0 - libopentelemetry-cpp >=1.27.0,<1.28.0a0 - 
libprotobuf >=6.33.5,<6.33.6.0a0 - - openssl >=3.5.6,<4.0a0 + - openssl >=3.5.7,<4.0a0 constrains: - - libgoogle-cloud 3.5.0 *_1 + - libgoogle-cloud 3.6.0 *_0 license: Apache-2.0 license_family: Apache purls: [] run_exports: weak: - - libgoogle-cloud >=3.5.0,<3.6.0a0 - size: 1812401 - timestamp: 1780031033935 + - libgoogle-cloud >=3.6.0,<3.7.0a0 + size: 1839082 + timestamp: 1781921657626 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgoogle-cloud-storage-2.34.0-h7081f7f_0.conda sha256: 79f6b93fb330728530036b2b38764e9d42e0eedd3ae7e549ac7eae49acd1e52b md5: f09cb03f9cf847f1dc41b4c1f65c97c2 @@ -28479,16 +28532,16 @@ packages: - libgoogle-cloud-storage >=3.3.0,<3.4.0a0 size: 523970 timestamp: 1774214725148 -- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgoogle-cloud-storage-3.5.0-ha114238_1.conda - sha256: 40b7074e3837fe3dcebef0e93f1f40fb995abd94787e51d231d31142e157dadd - md5: ecc3983f92594b3863a7e5d47d1a71ba +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgoogle-cloud-storage-3.6.0-ha114238_0.conda + sha256: a01821942ab88a433a81ec9a0e6aa72ed5f7ceb5e11a295f3eb28dd22a0a24bf + md5: b7c9ec99242e620133106438ccfcf08e depends: - __osx >=11.0 - libabseil - libcrc32c >=1.1.2,<1.2.0a0 - libcurl - libcxx >=19 - - libgoogle-cloud 3.5.0 h688a705_1 + - libgoogle-cloud 3.6.0 h688a705_0 - libzlib >=1.3.2,<2.0a0 - openssl license: Apache-2.0 @@ -28496,9 +28549,9 @@ packages: purls: [] run_exports: weak: - - libgoogle-cloud-storage >=3.5.0,<3.6.0a0 - size: 527597 - timestamp: 1780031485452 + - libgoogle-cloud-storage >=3.6.0,<3.7.0a0 + size: 528490 + timestamp: 1781921815114 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libgrpc-1.67.1-h0a426d6_2.conda sha256: a6114f6020f02387aa8bc9167d77c23177f8a3650b55fb0ee100c5227ca475f9 md5: c368d17cdc54d96aa6bd73d07816cf60 @@ -28822,15 +28875,15 @@ packages: - libparquet >=20.0.0,<20.1.0a0 size: 906358 timestamp: 1774280214549 -- conda: 
https://conda.anaconda.org/conda-forge/osx-arm64/libparquet-24.0.0-h840b369_7_cpu.conda - build_number: 7 - sha256: 9e0ca4e1e84a823ec2b85ae33028353ac8e0896ae98d4b48258eebc4926afa51 - md5: df79a126e560d6bba2f8711e0bac4cff +- conda: https://conda.anaconda.org/conda-forge/osx-arm64/libparquet-24.0.0-h840b369_8_cpu.conda + build_number: 8 + sha256: 1c09305b9799442be75ece3565ea9b25195fe7d8d038d68a483f55c70037b620 + md5: d1c458dee7223a02a095c27b86fa6fea depends: - __osx >=11.0 - libabseil * cxx17* - libabseil >=20260107.1,<20260108.0a0 - - libarrow 24.0.0 h1caba66_7_cpu + - libarrow 24.0.0 h6045e8e_8_cpu - libcxx >=21 - libopentelemetry-cpp >=1.27.0,<1.28.0a0 - libprotobuf >=6.33.5,<6.33.6.0a0 @@ -28841,8 +28894,8 @@ packages: run_exports: weak: - libparquet >=24.0.0,<24.1.0a0 - size: 1097024 - timestamp: 1781909673584 + size: 1098749 + timestamp: 1782184832374 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libpng-1.6.58-h132b30e_0.conda sha256: 66eae34546df1f098a67064970c92aa14ae7a7505091889e00468294d2882c36 md5: 2259ae0949dbe20c0665850365109b27 @@ -29705,6 +29758,7 @@ packages: constrains: - numpy-base <0a0 license: BSD-3-Clause + license_family: BSD purls: - pkg:pypi/numpy?source=compressed-mapping run_exports: @@ -29726,6 +29780,7 @@ packages: constrains: - numpy-base <0a0 license: BSD-3-Clause + license_family: BSD purls: - pkg:pypi/numpy?source=compressed-mapping run_exports: @@ -31022,7 +31077,7 @@ packages: license: BSD-3-Clause license_family: BSD purls: - - pkg:pypi/scikit-learn?source=hash-mapping + - pkg:pypi/scikit-learn?source=compressed-mapping run_exports: {} size: 9668485 timestamp: 1780401272693 @@ -31112,6 +31167,7 @@ packages: - python >=3.13,<3.14.0a0 - python_abi 3.13.* *_cp313 license: BSD-3-Clause + license_family: BSD purls: - pkg:pypi/scipy?source=compressed-mapping run_exports: {} @@ -31134,6 +31190,7 @@ packages: - python >=3.14,<3.15.0a0 - python_abi 3.14.* *_cp314 license: BSD-3-Clause + license_family: BSD purls: - 
pkg:pypi/scipy?source=compressed-mapping run_exports: {} @@ -31341,7 +31398,7 @@ packages: license: Apache-2.0 license_family: Apache purls: - - pkg:pypi/tornado?source=hash-mapping + - pkg:pypi/tornado?source=compressed-mapping run_exports: {} size: 881244 timestamp: 1781007287281 @@ -31356,7 +31413,7 @@ packages: license: Apache-2.0 license_family: Apache purls: - - pkg:pypi/tornado?source=compressed-mapping + - pkg:pypi/tornado?source=hash-mapping run_exports: {} size: 889689 timestamp: 1781007967544 @@ -31371,7 +31428,7 @@ packages: license: Apache-2.0 license_family: Apache purls: - - pkg:pypi/tornado?source=hash-mapping + - pkg:pypi/tornado?source=compressed-mapping run_exports: {} size: 915857 timestamp: 1781007345425 @@ -31638,7 +31695,7 @@ packages: license: MIT AND Apache-2.0 license_family: Apache purls: - - pkg:pypi/aiohttp?source=hash-mapping + - pkg:pypi/aiohttp?source=compressed-mapping run_exports: {} size: 1028246 timestamp: 1780913507305 @@ -32364,6 +32421,7 @@ packages: - libzlib >=1.3.2,<2.0a0 - aws-c-event-stream >=0.7.1,<0.7.2.0a0 license: Apache-2.0 + license_family: APACHE purls: [] run_exports: weak: @@ -32819,9 +32877,9 @@ packages: run_exports: {} size: 247437 timestamp: 1769155978556 -- conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.2-py310hdb0e946_0.conda - sha256: fcbb840b8862362872bce4e5f90b908f4c474f8bd1849812e6fbd4ca9977429d - md5: 16341aa5e1f32cc28dbb91c795df6a07 +- conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.3-py310hdb0e946_0.conda + sha256: 920a457a997e0d406ef3ddc523c0f6e71b9ae820f067cb4f8c921d2236a2dd34 + md5: 1902f106da2e2c74a630106eb46cbac0 depends: - python >=3.10,<3.11.0a0 - python_abi 3.10.* *_cp310 @@ -32831,13 +32889,13 @@ packages: - vc14_runtime >=14.44.35208 license: Apache-2.0 purls: - - pkg:pypi/coverage?source=compressed-mapping + - pkg:pypi/coverage?source=hash-mapping run_exports: {} - size: 343581 - timestamp: 1781984981795 -- conda: 
https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.2-py311h3f79411_0.conda - sha256: 908a8ad379b7a2f0df950b25ab1f75499f4d54fd49a006d9d97b4f9701a3cd15 - md5: 8484b4cd6933ff03b766682d0d8c9f53 + size: 342590 + timestamp: 1782178172908 +- conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.3-py311h3f79411_0.conda + sha256: e97365777e3c50d7de2dd463ab2d5565415ee95f033b72e0c223c5e43f2e4fb2 + md5: bc45a9bc9b619ca4b987f1e968500242 depends: - python >=3.11,<3.12.0a0 - python_abi 3.11.* *_cp311 @@ -32849,11 +32907,11 @@ packages: purls: - pkg:pypi/coverage?source=hash-mapping run_exports: {} - size: 430053 - timestamp: 1781984977603 -- conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.2-py313hd650c13_0.conda - sha256: 73aec8e7552a9c2ddff9e1aa311c09f586e8a789f878827a8bcef905904fa562 - md5: 27cb3c4806920cb7a6c7c390f43de1b6 + size: 429286 + timestamp: 1782178181364 +- conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.3-py313hd650c13_0.conda + sha256: 6f625e0bb29dc70030c68c897f2536f275d5fa1e1e008bda0412baa76185e4e7 + md5: 2182f8aff2b7bcabe704336e943eaaa9 depends: - python >=3.13,<3.14.0a0 - python_abi 3.13.* *_cp313 @@ -32865,11 +32923,11 @@ packages: purls: - pkg:pypi/coverage?source=hash-mapping run_exports: {} - size: 426266 - timestamp: 1781984986944 -- conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.2-py314h2359020_0.conda - sha256: 685827ede3b53a2adf75eeebcc23ad5808023bd0d78984718fe3e8c0a2a74ee9 - md5: f4f027fcc72c0b6cc7a12d153f30144f + size: 424963 + timestamp: 1782178171864 +- conda: https://conda.anaconda.org/conda-forge/win-64/coverage-7.14.3-py314h2359020_0.conda + sha256: 1879a89f00d90166db5d3eaa455aa329f5a2fbeab5662630548e477047074125 + md5: 2d2497e3c3b759375810139e7ccc0328 depends: - python >=3.14,<3.15.0a0 - python_abi 3.14.* *_cp314 @@ -32879,10 +32937,10 @@ packages: - vc14_runtime >=14.44.35208 license: Apache-2.0 purls: - - pkg:pypi/coverage?source=compressed-mapping + - 
pkg:pypi/coverage?source=hash-mapping run_exports: {} - size: 443892 - timestamp: 1781984983371 + size: 443224 + timestamp: 1782178194098 - conda: https://conda.anaconda.org/conda-forge/win-64/debugpy-1.8.21-py313h927ade5_0.conda sha256: 53814b871aa4996ed1254da1580eeb4c78d94b61bca7acd0b2e452ea1529ded0 md5: 647dafaeb1aa25808079a6d8e534b09d @@ -33306,7 +33364,7 @@ packages: license: Apache-2.0 license_family: APACHE purls: - - pkg:pypi/hf-xet?source=compressed-mapping + - pkg:pypi/hf-xet?source=hash-mapping run_exports: {} size: 3508553 timestamp: 1781767622373 @@ -33555,10 +33613,10 @@ packages: - libarrow >=20.0.0,<20.1.0a0 size: 5596071 timestamp: 1774283478907 -- conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-24.0.0-h54e786e_7_cpu.conda - build_number: 7 - sha256: 5725d734c9909f950b8d3785a95e70f1a548aae69fcbe06ebe9d36c03b2df599 - md5: 2ea4a6b55aad82b24e9053183f91fbfb +- conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-24.0.0-h9dce539_8_cpu.conda + build_number: 8 + sha256: 0881121701206aa0f1eeb5e1be1d5477eb280e5263d12e36a42e60825e946f70 + md5: e462e521999f15f3a9167044ce93a11a depends: - aws-crt-cpp >=0.40.1,<0.40.2.0a0 - aws-sdk-cpp >=1.11.747,<1.11.748.0a0 @@ -33573,8 +33631,8 @@ packages: - libbrotlienc >=1.2.0,<1.3.0a0 - libcrc32c >=1.1.2,<1.2.0a0 - libcurl >=8.20.0,<9.0a0 - - libgoogle-cloud >=3.5.0,<3.6.0a0 - - libgoogle-cloud-storage >=3.5.0,<3.6.0a0 + - libgoogle-cloud >=3.6.0,<3.7.0a0 + - libgoogle-cloud-storage >=3.6.0,<3.7.0a0 - libprotobuf >=6.33.5,<6.33.6.0a0 - libzlib >=1.3.2,<2.0a0 - lz4-c >=1.10.0,<1.11.0a0 @@ -33585,16 +33643,16 @@ packages: - vc14_runtime >=14.44.35208 - zstd >=1.5.7,<1.6.0a0 constrains: - - apache-arrow-proc =*=cpu - parquet-cpp <0.0a0 - arrow-cpp <0.0a0 + - apache-arrow-proc =*=cpu license: Apache-2.0 purls: [] run_exports: weak: - libarrow >=24.0.0,<24.1.0a0 - size: 4346032 - timestamp: 1781911707919 + size: 4395327 + timestamp: 1782188286274 - conda: 
https://conda.anaconda.org/conda-forge/win-64/libarrow-acero-15.0.2-h7d8d6a5_55_cpu.conda build_number: 55 sha256: b715f14f3f5be637bab8a6cb4aeadd52333c14385431f212f35090c282a59b2a @@ -33629,13 +33687,13 @@ packages: - libarrow-acero >=20.0.0,<20.1.0a0 size: 466450 timestamp: 1774283598578 -- conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-acero-24.0.0-h7d8d6a5_7_cpu.conda - build_number: 7 - sha256: e61b7ae4a863dc00608c028fef4f9c1d64c2176d26127299515d1c89898a416b - md5: c87e3bf7cf6aa1046dc4a959fd5087ee +- conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-acero-24.0.0-h7d8d6a5_8_cpu.conda + build_number: 8 + sha256: 0c3421c60a0a3b38d368c4fea9468086c845e62f3fd5e4ee0ffef87ce3c62f32 + md5: fdf4cb9cd72e1c0053d63f18e5c1ff0e depends: - - libarrow 24.0.0 h54e786e_7_cpu - - libarrow-compute 24.0.0 h081cd8e_7_cpu + - libarrow 24.0.0 h9dce539_8_cpu + - libarrow-compute 24.0.0 h081cd8e_8_cpu - ucrt >=10.0.20348.0 - vc >=14.3,<15 - vc14_runtime >=14.44.35208 @@ -33644,14 +33702,14 @@ packages: run_exports: weak: - libarrow-acero >=24.0.0,<24.1.0a0 - size: 446672 - timestamp: 1781911951828 -- conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-compute-24.0.0-h081cd8e_7_cpu.conda - build_number: 7 - sha256: 9cf68272aa13fa4e4591b0868a47e6d173ee2409b089fca18c4ed12fbb3d3e83 - md5: 17e1abffcee7c79d189449e5e5034d2b + size: 446066 + timestamp: 1782188561191 +- conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-compute-24.0.0-h081cd8e_8_cpu.conda + build_number: 8 + sha256: ee7eec54a2f1538ff117e37046af31812e2ac98ae60abe049fe0f72d26562aa2 + md5: 4e25e77f9d106635ebb0999283898169 depends: - - libarrow 24.0.0 h54e786e_7_cpu + - libarrow 24.0.0 h9dce539_8_cpu - libre2-11 >=2025.11.5 - libutf8proc >=2.11.3,<2.12.0a0 - re2 @@ -33663,8 +33721,8 @@ packages: run_exports: weak: - libarrow-compute >=24.0.0,<24.1.0a0 - size: 1753785 - timestamp: 1781911783917 + size: 1755117 + timestamp: 1782188380565 - conda: 
https://conda.anaconda.org/conda-forge/win-64/libarrow-dataset-15.0.2-h7d8d6a5_55_cpu.conda build_number: 55 sha256: 208d53026f5ff186df2c0da0ab5c10b8419288e83f3e322c58a286f26780c829 @@ -33703,15 +33761,15 @@ packages: - libarrow-dataset >=20.0.0,<20.1.0a0 size: 451589 timestamp: 1774283813404 -- conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-dataset-24.0.0-h7d8d6a5_7_cpu.conda - build_number: 7 - sha256: c08bf0563e069be701aa7722045d4076063bd90b08b12f9bbd6fea3d68e27da9 - md5: 34a6b857e6b10fd7f0412c3bb0500d3a +- conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-dataset-24.0.0-h7d8d6a5_8_cpu.conda + build_number: 8 + sha256: fd80c9b7062d45e1b696975ba3ade7b568cad9c5d65437bab52324fa0f0cb931 + md5: 5e3bba785b5005c49442ab808fb92679 depends: - - libarrow 24.0.0 h54e786e_7_cpu - - libarrow-acero 24.0.0 h7d8d6a5_7_cpu - - libarrow-compute 24.0.0 h081cd8e_7_cpu - - libparquet 24.0.0 h7051d1f_7_cpu + - libarrow 24.0.0 h9dce539_8_cpu + - libarrow-acero 24.0.0 h7d8d6a5_8_cpu + - libarrow-compute 24.0.0 h081cd8e_8_cpu + - libparquet 24.0.0 h7051d1f_8_cpu - ucrt >=10.0.20348.0 - vc >=14.3,<15 - vc14_runtime >=14.44.35208 @@ -33720,8 +33778,8 @@ packages: run_exports: weak: - libarrow-dataset >=24.0.0,<24.1.0a0 - size: 428740 - timestamp: 1781912054670 + size: 428335 + timestamp: 1782188679589 - conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-flight-15.0.2-h3601c32_55_cpu.conda build_number: 55 sha256: ed0100a5ab2d8ffe4e23729a32ab1adfb47396a3a324baec38db49d24c651aa0 @@ -33829,16 +33887,16 @@ packages: - libarrow-substrait >=20.0.0,<20.1.0a0 size: 369202 timestamp: 1774283981103 -- conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-substrait-24.0.0-h524e9bd_7_cpu.conda - build_number: 7 - sha256: e19bdb3954bcd65262e205e7c69fb6ffdfd59d46874c694391e48ee3c70aa676 - md5: ed2a58e4bd009e41e07622908ef2cb76 +- conda: https://conda.anaconda.org/conda-forge/win-64/libarrow-substrait-24.0.0-h524e9bd_8_cpu.conda + build_number: 8 + sha256: 
a6419bd428e449d340229369529806b2d38ba3ed2030a6db495a9d0ea054b58d + md5: 26cc2bc5c369c96eeaf10289e80970d9 depends: - libabseil * cxx17* - libabseil >=20260107.1,<20260108.0a0 - - libarrow 24.0.0 h54e786e_7_cpu - - libarrow-acero 24.0.0 h7d8d6a5_7_cpu - - libarrow-dataset 24.0.0 h7d8d6a5_7_cpu + - libarrow 24.0.0 h9dce539_8_cpu + - libarrow-acero 24.0.0 h7d8d6a5_8_cpu + - libarrow-dataset 24.0.0 h7d8d6a5_8_cpu - libprotobuf >=6.33.5,<6.33.6.0a0 - ucrt >=10.0.20348.0 - vc >=14.3,<15 @@ -33848,8 +33906,8 @@ packages: run_exports: weak: - libarrow-substrait >=24.0.0,<24.1.0a0 - size: 362535 - timestamp: 1781912089151 + size: 361992 + timestamp: 1782188719264 - conda: https://conda.anaconda.org/conda-forge/win-64/libblas-3.11.0-8_h8455456_mkl.conda build_number: 8 sha256: 43a87b59e6d4c68d80b2e4de487b1b54d66fe1f9a06636909b5a5ab9eae27269 @@ -34279,9 +34337,9 @@ packages: - libgoogle-cloud >=3.3.0,<3.4.0a0 size: 17141 timestamp: 1774217556612 -- conda: https://conda.anaconda.org/conda-forge/win-64/libgoogle-cloud-3.5.0-he22669a_1.conda - sha256: 3904d8f8a0bddc5b5baa534048c2633375b04337c14c3416c446bd6f667a5805 - md5: 526136b0b872c2841e5947be047dadee +- conda: https://conda.anaconda.org/conda-forge/win-64/libgoogle-cloud-3.6.0-he22669a_0.conda + sha256: 11cb7ec822abcf6feedcd778f1f71d889b4c1b270949927aa468a6b24abe230d + md5: 24239981d980d39030515bc50f696e2f depends: - libabseil * cxx17* - libabseil >=20260107.1,<20260108.0a0 @@ -34293,15 +34351,15 @@ packages: - vc >=14.3,<15 - vc14_runtime >=14.44.35208 constrains: - - libgoogle-cloud 3.5.0 *_1 + - libgoogle-cloud 3.6.0 *_0 license: Apache-2.0 license_family: Apache purls: [] run_exports: weak: - - libgoogle-cloud >=3.5.0,<3.6.0a0 - size: 18087 - timestamp: 1780034913635 + - libgoogle-cloud >=3.6.0,<3.7.0a0 + size: 17255 + timestamp: 1781928103484 - conda: https://conda.anaconda.org/conda-forge/win-64/libgoogle-cloud-storage-2.34.0-he5eb982_0.conda sha256: e98eda80a657ae4271eca189e617c740aed806b4c357cf02df3b29b7c481a4ed md5: 
c9a65d04330bb5c9282d7ddb209b0c56 @@ -34342,14 +34400,14 @@ packages: - libgoogle-cloud-storage >=3.3.0,<3.4.0a0 size: 17112 timestamp: 1774217996193 -- conda: https://conda.anaconda.org/conda-forge/win-64/libgoogle-cloud-storage-3.5.0-he04ea4c_1.conda - sha256: 90c9e66fc403ee42d1fb23dafb5873712bc89b103c22d963ebf932bce6cffefc - md5: 7249500fac23f02b60b773878e4668b1 +- conda: https://conda.anaconda.org/conda-forge/win-64/libgoogle-cloud-storage-3.6.0-he04ea4c_0.conda + sha256: 5c62334045397f97437f7cb2d44672914eb2facc12839aaac87386f2aeebd05b + md5: 0f4a153aa13c9a98de0be992cfd9f6bb depends: - libabseil - libcrc32c >=1.1.2,<1.2.0a0 - libcurl - - libgoogle-cloud 3.5.0 he22669a_1 + - libgoogle-cloud 3.6.0 he22669a_0 - libzlib >=1.3.2,<2.0a0 - ucrt >=10.0.20348.0 - vc >=14.3,<15 @@ -34359,9 +34417,9 @@ packages: purls: [] run_exports: weak: - - libgoogle-cloud-storage >=3.5.0,<3.6.0a0 - size: 18067 - timestamp: 1780035234126 + - libgoogle-cloud-storage >=3.6.0,<3.7.0a0 + size: 17216 + timestamp: 1781928427840 - conda: https://conda.anaconda.org/conda-forge/win-64/libgrpc-1.67.1-h0ac93cb_2.conda sha256: 096b08185da8c11fdc30f6e117fdf7ad5bff6535b2698428de7c96fdbe23ca29 md5: ec35578e8658d5f720b6180211276ca6 @@ -34687,12 +34745,12 @@ packages: - libparquet >=20.0.0,<20.1.0a0 size: 841340 timestamp: 1774283764941 -- conda: https://conda.anaconda.org/conda-forge/win-64/libparquet-24.0.0-h7051d1f_7_cpu.conda - build_number: 7 - sha256: 303cc0ca829eab3d2d7852217cfa76c56983f4d8b5400a6b9b53a5b359a78c92 - md5: 2f523c93d99a8a68d3f8c25e5d5b2ac6 +- conda: https://conda.anaconda.org/conda-forge/win-64/libparquet-24.0.0-h7051d1f_8_cpu.conda + build_number: 8 + sha256: 0e7710a5b804f0e202a89d03a8fc587d0c795a2e9a050fb8d4765a3b4d2bd1f5 + md5: 743708ec12c6c7ae1570b80e0f0067e9 depends: - - libarrow 24.0.0 h54e786e_7_cpu + - libarrow 24.0.0 h9dce539_8_cpu - libthrift >=0.22.0,<0.22.1.0a0 - openssl >=3.5.7,<4.0a0 - ucrt >=10.0.20348.0 @@ -34703,8 +34761,8 @@ packages: run_exports: weak: - 
libparquet >=24.0.0,<24.1.0a0 - size: 966550 - timestamp: 1781911917583 + size: 966046 + timestamp: 1782188523630 - conda: https://conda.anaconda.org/conda-forge/win-64/libpng-1.6.58-h7351971_0.conda sha256: 218913aeee391460bd0e341b834dbd9c6fa6ae0a4276c0c300266cc99a816a28 md5: 52f1280563f3b48b5f75414cd2d15dd1 @@ -35433,7 +35491,7 @@ packages: license: PSF-2.0 license_family: PSF purls: - - pkg:pypi/matplotlib?source=hash-mapping + - pkg:pypi/matplotlib?source=compressed-mapping run_exports: {} size: 8803186 timestamp: 1781627107274 @@ -35700,6 +35758,7 @@ packages: constrains: - numpy-base <0a0 license: BSD-3-Clause + license_family: BSD purls: - pkg:pypi/numpy?source=compressed-mapping run_exports: @@ -35722,6 +35781,7 @@ packages: constrains: - numpy-base <0a0 license: BSD-3-Clause + license_family: BSD purls: - pkg:pypi/numpy?source=compressed-mapping run_exports: @@ -37291,6 +37351,7 @@ packages: - vc >=14.3,<15 - vc14_runtime >=14.44.35208 license: BSD-3-Clause + license_family: BSD purls: - pkg:pypi/scipy?source=compressed-mapping run_exports: {} @@ -37312,6 +37373,7 @@ packages: - vc >=14.3,<15 - vc14_runtime >=14.44.35208 license: BSD-3-Clause + license_family: BSD purls: - pkg:pypi/scipy?source=compressed-mapping run_exports: {} @@ -37608,7 +37670,7 @@ packages: license: Apache-2.0 license_family: Apache purls: - - pkg:pypi/tornado?source=hash-mapping + - pkg:pypi/tornado?source=compressed-mapping run_exports: {} size: 919275 timestamp: 1781006902968 @@ -38216,6 +38278,8 @@ packages: - sphinx-copybutton ; extra == 'dev' - sphinx-gallery ; extra == 'dev' - sphinxext-opengraph ; extra == 'dev' + - sphinx-llms-txt ; extra == 'dev' + - sphinx-markdown-builder ; extra == 'dev' - sphinx-autosummary-accessors ; extra == 'dev' - statsmodels ; extra == 'dev' - ruff==0.15.0 ; extra == 'dev' @@ -39011,24 +39075,24 @@ packages: - trove-classifiers>=2024.10.12 ; extra == 'tests' - defusedxml ; extra == 'xmp' requires_python: '>=3.10' -- pypi: 
https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev169/pyarrow-25.0.0.dev169-cp314-cp314-macosx_12_0_arm64.whl +- pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev171/pyarrow-25.0.0.dev171-cp314-cp314-macosx_12_0_arm64.whl name: pyarrow - version: 25.0.0.dev169 + version: 25.0.0.dev171 index: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple requires_python: '>=3.10' -- pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev169/pyarrow-25.0.0.dev169-cp314-cp314-macosx_12_0_x86_64.whl +- pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev171/pyarrow-25.0.0.dev171-cp314-cp314-macosx_12_0_x86_64.whl name: pyarrow - version: 25.0.0.dev169 + version: 25.0.0.dev171 index: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple requires_python: '>=3.10' -- pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev169/pyarrow-25.0.0.dev169-cp314-cp314-manylinux_2_28_x86_64.whl +- pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev171/pyarrow-25.0.0.dev171-cp314-cp314-manylinux_2_28_x86_64.whl name: pyarrow - version: 25.0.0.dev169 + version: 25.0.0.dev171 index: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple requires_python: '>=3.10' -- pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev169/pyarrow-25.0.0.dev169-cp314-cp314-win_amd64.whl +- pypi: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/pyarrow/25.0.0.dev171/pyarrow-25.0.0.dev171-cp314-cp314-win_amd64.whl name: pyarrow - version: 25.0.0.dev169 + version: 25.0.0.dev171 index: https://pypi.anaconda.org/scientific-python-nightly-wheels/simple requires_python: '>=3.10' - pypi: 
https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/scikit-learn/1.10.dev0/scikit_learn-1.10.dev0-cp314-cp314-macosx_10_15_x86_64.whl diff --git a/pyproject.toml b/pyproject.toml index 38991146f..f3b04a79e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,8 @@ dev = [ "sphinx-copybutton", "sphinx-gallery", "sphinxext-opengraph", + "sphinx-llms-txt", + "sphinx-markdown-builder", "sphinx-autosummary-accessors", "statsmodels", @@ -91,6 +93,9 @@ Issues = "https://github.com/skrub-data/skrub/issues" [tool.setuptools] packages = ["skrub"] +[tool.setuptools.package-data] +skrub = ["_docs/**/*.rst", "_docs/**/*.py"] + [tool.pixi.workspace] channels = ["conda-forge", "pytorch"] platforms = ["linux-64", "osx-arm64", "osx-64", "win-64"] @@ -124,6 +129,8 @@ sphinx-gallery = "*" sphinxext-opengraph = "*" sphinx-autosummary-accessors = ">=2025.3.1,<2026" sphinx-sitemap = "*" +sphinx-llms-txt = "*" +sphinx-markdown-builder = "*" statsmodels = "*" optuna = "*" skorch = "*" @@ -278,6 +285,7 @@ exclude = [ "dist", "doc/_build", "doc/auto_examples", + "skrub/_docs/*.py", "build", "pixi.lock", ] @@ -360,6 +368,7 @@ xfail_strict = true addopts = ["--doctest-modules", "--strict-config", "--strict-markers"] doctest_optionflags = "NORMALIZE_WHITESPACE ELLIPSIS" + [tool.codespell] # Ref: https://github.com/codespell-project/codespell#using-a-config-file skip = '.git*,*.svg,package-lock.json,*.lock,*.css,*-min.*,pyproject.toml' diff --git a/skrub/__init__.py b/skrub/__init__.py index f3c88747a..de0f2c2ce 100644 --- a/skrub/__init__.py +++ b/skrub/__init__.py @@ -6,14 +6,20 @@ data. It helps clean, encode, and transform dataframes into features ready for scikit-learn or other ML frameworks. 
-Docs: https://skrub-data.org/stable/reference/index.html -User Guide: https://skrub-data.org/stable/documentation.html +Bundled docs: ``skrub.__docs_dir__`` +Bundled getting started: ``skrub.__docs_dir__ / "tutorials"`` +Bundled examples: ``skrub.__docs_dir__ / "examples"`` + +Online docs: https://skrub-data.org/stable/reference/index.html Source: https://github.com/skrub-data/skrub/ -Examples: https://skrub-data.org/stable/auto_examples/index.html """ from pathlib import Path as _Path +#: Path to the documentation bundled with the package. +#: Use ``skrub.__docs_dir__`` to access it programmatically. +__docs_dir__ = _Path(__file__).parent / "_docs" + from . import core, selectors from ._agg_joiner import AggJoiner, AggTarget from ._apply_to_cols import ApplyToCols @@ -117,4 +123,5 @@ "InterpolationJoiner", "config_context", "core", + "__docs_dir__", ] diff --git a/skrub/_apply_to_cols.py b/skrub/_apply_to_cols.py index 7168cf70b..64d34e9cf 100644 --- a/skrub/_apply_to_cols.py +++ b/skrub/_apply_to_cols.py @@ -216,6 +216,25 @@ class ApplyToCols(TransformerMixin, SkrubBaseEstimator): skrub.core.RejectColumn: Column 'A' does not have Date or Datetime dtype. Transformer DatetimeEncoder.fit_transform failed on column 'A'. See above for the full traceback. + It is also possible to wrap a :class:`TableVectorizer` or :class:`Cleaner` in + ``ApplyToCols`` to select or exclude columns based on patterns. For example, + to apply a :class:`TableVectorizer` to all columns except those ending with "_id", + we can do: + + >>> import skrub.selectors as s + >>> from skrub import ApplyToCols, TableVectorizer + + >>> df = pd.DataFrame(dict( + ... user_id=["A001", "A002"], + ... age=[25, 30], + ... department=["Engineering", "Sales"], + ... 
)) + >>> tv = ApplyToCols(TableVectorizer(), cols=~s.glob("*_id")) + >>> tv.fit_transform(df) + user_id age department_Sales + 0 A001 25.0 0.0 + 1 A002 30.0 1.0 + **Accessing fitted transformers** Depending on the transformer, the fitted transformers diff --git a/skrub/_docs/about.rst b/skrub/_docs/about.rst new file mode 100644 index 000000000..b8f8a1bf0 --- /dev/null +++ b/skrub/_docs/about.rst @@ -0,0 +1,18 @@ + +About +----- + +skrub shares much of its DNA with `scikit-learn +`__. + +skrub is the continuation of `dirty-cat `_ +with a broader scope and greater ambition. + +skrub is a young project born from research. We welcome feedback +on successes and failures with the different techniques on real-world data, or +suggestions for open datasets on which we can do better examples and empirical work. + +skrub received funding from French research projects: `DirtyData +`_ (ANR-17-CE23-0018), `LearnI +`_ (ANR-20-CHIA-0026), and `P16 +`_. diff --git a/skrub/_docs/column_level_featurizing.rst b/skrub/_docs/column_level_featurizing.rst new file mode 100644 index 000000000..3c9e9c1f7 --- /dev/null +++ b/skrub/_docs/column_level_featurizing.rst @@ -0,0 +1,19 @@ +.. _user_guide_encoders_index: + +Column-level feature extraction +=============================== + +Skrub provides various transformers that help with feature engineering numeric, +datetime and categorical data. The encoders covered in this section convert the +raw features found in an input dataframe into numeric features that can be used +directly by machine learning models. + +.. include:: includes/big_toc_css.rst + +.. 
toctree:: + :maxdepth: 3 + + modules/column_level_featurizing/feature_engineering_categorical + modules/column_level_featurizing/feature_engineering_datetimes + modules/column_level_featurizing/feature_engineering_numerical + modules/column_level_featurizing/advanced_columnwise_operations diff --git a/skrub/_docs/data_ops.rst b/skrub/_docs/data_ops.rst new file mode 100644 index 000000000..31c832f54 --- /dev/null +++ b/skrub/_docs/data_ops.rst @@ -0,0 +1,97 @@ +.. _user_guide_data_ops_index: + +.. currentmodule:: skrub + +Building complete pipelines with DataOps +======================================== + +A skrub DataOp is a complete machine learning pipeline —from data loading and +wrangling to the final prediction— in a single object that can be fitted, tuned, +cross-validated, and saved in a file like any scikit-learn estimator. + +By integrating the whole data processing, DataOps help to validate pipelines +while **avoiding data leakage**, to **tune complex modelling choices**, and to keep +track of important **fitted (learned) state**. + +To solve a machine-learning task we often need to combine multiple operations +such as loading and filtering data, joining tables and computing aggregations, +extracting numerical features, and fitting a classifier or regressor. + +**Storing state**  Each of those operations may need to be fitted: to learn some +information from training data and reuse it to apply consistent transformations +to new data. This is the case for transformers like the +:class:`~sklearn.preprocessing.StandardScaler` and :class:`TableVectorizer` and +estimators like :class:`~sklearn.ensemble.RandomForestClassifier`. + +**Tuning**  Moreover, each processing step may involve decisions that need to be +tuned (*tuning* means finding the value that gives the best predictive +performance), for example: what weather forecast features should I include to +predict the load on an electric grid? 
How should I encode a product description +to help predict the product's category? What learning rate to set on a +:class:`~sklearn.ensemble.HistGradientBoostingRegressor`? + +**Validation**  Finally, the quality of predictions must be evaluated on +held-out data (with a train/test split or cross-validation), taking care to +**avoid leakage** of test data into the training set. + +Separating the data wrangling from the fitted estimator prevents correctly +handling the tasks above. Skrub DataOps help by binding an arbitrary set of +transformations of any number of inputs in a single estimator. These +transformations can be easily parametrized with tunable choices. The resulting +objects have built-in methods for cross-validation and tuning with either Optuna +or scikit-learn, and for inspecting runs and intermediate results. Once fitted, +they can be saved in a file, loaded, applied to new data as easily as a single +:class:`~sklearn.linear_model.LogisticRegression`. + +.. dropdown:: Going beyond the scikit-learn Pipeline + :color: primary + + To some extent, the DataOps exist for the same reasons as the simpler + scikit-learn :class:`sklearn.pipeline.Pipeline` used in other parts of this + documentation. However the Pipeline is too limited for many real-world problems: + it can only represent a linear sequence of scikit-learn transformers, the design + matrix and target variables must be constructed and divided into training and + testing sets outside of the pipeline and the number of rows cannot change, only + a single table can be handled, hyperparameter choices are difficult to define, + etc. Skrub DataOps remove those limitations and add several useful features + such as interactive previews and integration with Optuna. + +Data Ops basic concepts +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
toctree:: + :maxdepth: 3 + + modules/data_ops/basics/what_are_data_ops + modules/data_ops/basics/building_data_ops_plan + auto_tutorials/1110_data_ops_intro + modules/data_ops/basics/using_previews + modules/data_ops/basics/direct_access_methods + modules/data_ops/basics/control_flow + modules/data_ops/basics/data_ops_vs_alternatives + +Building a complex pipeline with the skrub Data Ops +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. toctree:: + :maxdepth: 2 + + modules/data_ops/ml_pipeline/applying_ml_estimators + modules/data_ops/ml_pipeline/applying_different_transformers + modules/data_ops/ml_pipeline/documenting_data_ops_plan + modules/data_ops/ml_pipeline/evaluating_debugging_data_ops + modules/data_ops/ml_pipeline/using_part_of_data_ops_plan + modules/data_ops/ml_pipeline/subsampling_data + +Tuning and validating skrub DataOps plans +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. toctree:: + :maxdepth: 2 + + modules/data_ops/validation/tuning_validating_data_ops + modules/data_ops/validation/hyperparameter_tuning + modules/data_ops/validation/nested_cross_validation + modules/data_ops/validation/nesting_choices_choosing_pipelines + modules/data_ops/validation/exporting_data_ops + modules/data_ops/validation/tuning_with_optuna diff --git a/skrub/_docs/default_wrangling.rst b/skrub/_docs/default_wrangling.rst new file mode 100644 index 000000000..4ce00b846 --- /dev/null +++ b/skrub/_docs/default_wrangling.rst @@ -0,0 +1,17 @@ +.. _user_guide_building_pipeline_index: + +Wrangling data with good defaults +================================= + +This section covers how to build a predictive pipeline starting from a dataframe. +The skrub objects described in this section can be used as strong defaults for +building baseline pipelines, and can be customized for specific use cases. + + +.. 
toctree:: + :maxdepth: 3 + + modules/default_wrangling/cleaning_dataframes + modules/default_wrangling/table_vectorizer + modules/default_wrangling/tabular_pipeline + modules/default_wrangling/apply_to_cols diff --git a/skrub/_docs/development.rst b/skrub/_docs/development.rst new file mode 100644 index 000000000..15989bfe7 --- /dev/null +++ b/skrub/_docs/development.rst @@ -0,0 +1,18 @@ + +=========== +Development +=========== + +While ``skrub`` is still in its early stages, we believe in openness and +community development from the start. Join us in building a great package to +facilitate learning on databases. + +.. include:: includes/big_toc_css.rst + +.. toctree:: + + vision + about + CONTRIBUTING + tutorial_example + RELEASE_PROCESS diff --git a/skrub/_docs/documentation.rst b/skrub/_docs/documentation.rst new file mode 100644 index 000000000..1512bcf92 --- /dev/null +++ b/skrub/_docs/documentation.rst @@ -0,0 +1,27 @@ +.. _user_guide: + +User Guide +========== + +Skrub is a Python library that facilitates machine learning with tabular data +(dataframes, such as pandas and polars) using a scikit-learn-compatible API. + +Use the sections below to navigate the guide. For a quickstart example, +try :ref:`Getting Started `. +For runnable code, see the :doc:`Example gallery `. +For class and function details, see the :ref:`API Reference `. +For common use cases and how to address them, see the :ref:`How-to guides `. + + +.. include:: includes/big_toc_css.rst + +.. 
toctree:: + :maxdepth: 3 + + auto_tutorials/0000_getting_started + exploring_a_dataframe + default_wrangling + column_level_featurizing + multi_column_operations + data_ops + joining_dataframes diff --git a/examples/0010_apply_to_cols.py b/skrub/_docs/examples/0010_apply_to_cols.py similarity index 100% rename from examples/0010_apply_to_cols.py rename to skrub/_docs/examples/0010_apply_to_cols.py diff --git a/examples/0050_deduplication.py b/skrub/_docs/examples/0050_deduplication.py similarity index 100% rename from examples/0050_deduplication.py rename to skrub/_docs/examples/0050_deduplication.py diff --git a/examples/0100_squashing_scaler.py b/skrub/_docs/examples/0100_squashing_scaler.py similarity index 100% rename from examples/0100_squashing_scaler.py rename to skrub/_docs/examples/0100_squashing_scaler.py diff --git a/examples/01_encoding/0010_encodings.py b/skrub/_docs/examples/01_encoding/0010_encodings.py similarity index 100% rename from examples/01_encoding/0010_encodings.py rename to skrub/_docs/examples/01_encoding/0010_encodings.py diff --git a/examples/01_encoding/0020_text_with_string_encoders.py b/skrub/_docs/examples/01_encoding/0020_text_with_string_encoders.py similarity index 100% rename from examples/01_encoding/0020_text_with_string_encoders.py rename to skrub/_docs/examples/01_encoding/0020_text_with_string_encoders.py diff --git a/examples/01_encoding/0030_datetime_encoder.py b/skrub/_docs/examples/01_encoding/0030_datetime_encoder.py similarity index 100% rename from examples/01_encoding/0030_datetime_encoder.py rename to skrub/_docs/examples/01_encoding/0030_datetime_encoder.py diff --git a/examples/01_encoding/GALLERY_HEADER.rst b/skrub/_docs/examples/01_encoding/GALLERY_HEADER.rst similarity index 100% rename from examples/01_encoding/GALLERY_HEADER.rst rename to skrub/_docs/examples/01_encoding/GALLERY_HEADER.rst diff --git a/examples/02_data_ops/1120_multiple_tables.py b/skrub/_docs/examples/02_data_ops/1120_multiple_tables.py 
similarity index 100% rename from examples/02_data_ops/1120_multiple_tables.py rename to skrub/_docs/examples/02_data_ops/1120_multiple_tables.py diff --git a/examples/02_data_ops/1130_choices.py b/skrub/_docs/examples/02_data_ops/1130_choices.py similarity index 100% rename from examples/02_data_ops/1130_choices.py rename to skrub/_docs/examples/02_data_ops/1130_choices.py diff --git a/examples/02_data_ops/1131_optuna_choices.py b/skrub/_docs/examples/02_data_ops/1131_optuna_choices.py similarity index 100% rename from examples/02_data_ops/1131_optuna_choices.py rename to skrub/_docs/examples/02_data_ops/1131_optuna_choices.py diff --git a/examples/02_data_ops/1140_subsampling.py b/skrub/_docs/examples/02_data_ops/1140_subsampling.py similarity index 100% rename from examples/02_data_ops/1140_subsampling.py rename to skrub/_docs/examples/02_data_ops/1140_subsampling.py diff --git a/examples/02_data_ops/1150_use_case.py b/skrub/_docs/examples/02_data_ops/1150_use_case.py similarity index 100% rename from examples/02_data_ops/1150_use_case.py rename to skrub/_docs/examples/02_data_ops/1150_use_case.py diff --git a/examples/02_data_ops/1160_pytorch.py b/skrub/_docs/examples/02_data_ops/1160_pytorch.py similarity index 100% rename from examples/02_data_ops/1160_pytorch.py rename to skrub/_docs/examples/02_data_ops/1160_pytorch.py diff --git a/examples/02_data_ops/GALLERY_HEADER.rst b/skrub/_docs/examples/02_data_ops/GALLERY_HEADER.rst similarity index 100% rename from examples/02_data_ops/GALLERY_HEADER.rst rename to skrub/_docs/examples/02_data_ops/GALLERY_HEADER.rst diff --git a/examples/03_joining/0040_fuzzy_joining.py b/skrub/_docs/examples/03_joining/0040_fuzzy_joining.py similarity index 100% rename from examples/03_joining/0040_fuzzy_joining.py rename to skrub/_docs/examples/03_joining/0040_fuzzy_joining.py diff --git a/examples/03_joining/0060_multiple_key_join.py b/skrub/_docs/examples/03_joining/0060_multiple_key_join.py similarity index 100% rename from 
examples/03_joining/0060_multiple_key_join.py rename to skrub/_docs/examples/03_joining/0060_multiple_key_join.py diff --git a/examples/03_joining/0070_join_aggregation.py b/skrub/_docs/examples/03_joining/0070_join_aggregation.py similarity index 100% rename from examples/03_joining/0070_join_aggregation.py rename to skrub/_docs/examples/03_joining/0070_join_aggregation.py diff --git a/examples/03_joining/0080_interpolation_join.py b/skrub/_docs/examples/03_joining/0080_interpolation_join.py similarity index 100% rename from examples/03_joining/0080_interpolation_join.py rename to skrub/_docs/examples/03_joining/0080_interpolation_join.py diff --git a/examples/03_joining/GALLERY_HEADER.rst b/skrub/_docs/examples/03_joining/GALLERY_HEADER.rst similarity index 100% rename from examples/03_joining/GALLERY_HEADER.rst rename to skrub/_docs/examples/03_joining/GALLERY_HEADER.rst diff --git a/examples/GALLERY_HEADER.rst b/skrub/_docs/examples/GALLERY_HEADER.rst similarity index 100% rename from examples/GALLERY_HEADER.rst rename to skrub/_docs/examples/GALLERY_HEADER.rst diff --git a/skrub/_docs/exploring_a_dataframe.rst b/skrub/_docs/exploring_a_dataframe.rst new file mode 100644 index 000000000..03f8230fe --- /dev/null +++ b/skrub/_docs/exploring_a_dataframe.rst @@ -0,0 +1,13 @@ +.. _user_guide_exploring_a_dataframe_index: + +Exploring a Dataframe +===================== + +This section covers the :class:`~skrub.TableReport` and how it can be used for exploring +and understanding your dataframes. + + +.. toctree:: + :maxdepth: 3 + + modules/tablereport/exploring_dataframes_interactively diff --git a/skrub/_docs/guides/table_report/01_alter_appearance.rst b/skrub/_docs/guides/table_report/01_alter_appearance.rst new file mode 100644 index 000000000..4ba65d869 --- /dev/null +++ b/skrub/_docs/guides/table_report/01_alter_appearance.rst @@ -0,0 +1,25 @@ +.. |TableReport| replace:: :class:`~skrub.TableReport` +.. |set_config| replace:: :func:`~skrub.set_config` +.. 
|column_associations| replace:: :func:`~skrub.column_associations` + +.. _user_guide_table_report_customize: + +How to tweak the Appearance of the |TableReport| +------------------------------------------------ + +The skrub global configuration includes various parameters that let you tweak +the HTML representation of the |TableReport|. + +For performance reasons, the |TableReport| disables the computation of +distributions and associations for tables with more than 30 columns. +This behavior can be overridden by setting the parameters ``plot_distributions`` +and ``compute_associations`` to ``True`` respectively. + +It is also possible to specify the floating point precision by setting the appropriate +``float_precision`` parameter. + +The column threshold that is used by the |TableReport| can be modified in a given +script by using |set_config| and changing the values of +``table_report_plots_threshold`` and ``table_report_associations_threshold`` to +the desired threshold. Environment variables are also provided to set the threshold +permanently. Refer to :ref:`user_guide_configuration_parameters` for more detail. diff --git a/skrub/_docs/guides/table_report/02_exporting.rst b/skrub/_docs/guides/table_report/02_exporting.rst new file mode 100644 index 000000000..4805d2cf5 --- /dev/null +++ b/skrub/_docs/guides/table_report/02_exporting.rst @@ -0,0 +1,61 @@ +.. |TableReport| replace:: :class:`~skrub.TableReport` +.. |set_config| replace:: :func:`~skrub.set_config` +.. |column_associations| replace:: :func:`~skrub.column_associations` + +.. _user_guide_table_report_sharing: +How to export and share the |TableReport| +----------------------------------------- + +The |TableReport| is generated as a standalone HTML file that includes the report +data, the plots, and the Javascript necessary to provide interactivity. + +If it is generated inside a notebook (Jupyter or Marimo), the |TableReport| is +rendered directly inside the cell where it is called. 
If, instead, it is generated +by a script, the report will need to be opened by calling ``.open()``: + +>>> TableReport(df).open() # doctest: +SKIP + +Note that calling ``.open()`` will start a standalone process that hosts the report, +and a tab will be opened in the default browser. It is not possible to save the +report from the webpage. The function :func:`~skrub.TableReport.write_html` should +be used for that: + +.. code-block:: + + tr = TableReport(df) + tr.write_html("my_report.html") + +It is also possible to export the raw HTML, or an HTML fragment to embed in a page +with :func:`~skrub.TableReport.html` and :func:`~skrub.TableReport.html_snippet` +respectively. + +The report can be exported in JSON format, which allows structured +access to the data and statistics used to build the report with +:func:`~skrub.TableReport.json`. + +.. code-block:: + + tr = TableReport(df) + json_data = tr.json() + +Note that this will export all parts of the |TableReport|, including the distribution +plots in SVG format if they have been generated. If you do not need them, plots should be +disabled directly when generating the table report. + +.. code-block:: + + tr = TableReport(df, plot_distributions=False) + json_data = tr.json() + +Finally, :func:`~skrub.TableReport.markdown` produces a shortened summary of the +report in Markdown format. This summary contains the measured statistics and the +associations (if measured): plots and table preview are skipped from this view. +This format can be shared easily in text form, or fed to an AI agent to obtain +insight about a given table. + +.. warning:: + + No sanitization of the input data is performed, and the report includes raw data + (column names and cell values). Therefore, it should not be used on untrusted data, + or when the resulting summary may be too large as it could lead to security risks + or performance problems. 
diff --git a/skrub/_docs/guides/table_report/03_finding_correlated_columns.rst b/skrub/_docs/guides/table_report/03_finding_correlated_columns.rst new file mode 100644 index 000000000..746ab5ce4 --- /dev/null +++ b/skrub/_docs/guides/table_report/03_finding_correlated_columns.rst @@ -0,0 +1,38 @@ +.. |TableReport| replace:: :class:`~skrub.TableReport` +.. |DropSimilar| replace:: :class:`~skrub.DropSimilar` +.. |column_associations| replace:: :func:`~skrub.column_associations` + +.. _user_guide_table_report_associations: + +How to find correlated columns in a dataframe +============================================ + +In addition to |TableReport|'s **Associations** tab, you can compute associations +using the |column_associations| function, which returns a dataframe containing the +associations. + +Reported metrics include `Cramer’s V statistic `_ +and `Pearson’s Correlation Coefficient `_. +The result is returned as a dataframe that contains the column name and idx for the +left and the right table, and both associations; results are sorted in descending order +by Cramer’s V association. + +This can be useful to have access to the information used in the |TableReport| +for later use (e.g., to select which columns to drop). These associations are +also used by the |DropSimilar| transformer to select which columns should be dropped. + +.. 
code-block:: + + from skrub import column_associations + from skrub.datasets import fetch_employee_salaries + import pandas as pd + path = fetch_employee_salaries().path + df = pd.read_csv(path) + column_associations(df).head() + + left_column_name left_column_idx right_column_name right_column_idx cramer_v pearson_corr + 0 department 1 department_name 2 1.000000 NaN + 1 assignment_category 4 current_annual_salary 8 0.635525 NaN + 2 division 3 assignment_category 4 0.601097 NaN + 3 assignment_category 4 employee_position_title 5 0.496814 NaN + 4 division 3 employee_position_title 5 0.416034 NaN diff --git a/skrub/_docs/guides/table_report/04_custom_filters.rst b/skrub/_docs/guides/table_report/04_custom_filters.rst new file mode 100644 index 000000000..807e963a0 --- /dev/null +++ b/skrub/_docs/guides/table_report/04_custom_filters.rst @@ -0,0 +1,32 @@ +.. |TableReport| replace:: :class:`~skrub.TableReport` + + +How to define custom filters for the TableReport +================================================ + +It is possible to define custom filters for the |TableReport| using either column +names, or :ref:`skrub selectors `. + +By defining a custom filter, it becomes easier to show and work directly on a given +subset of columns. + +For example, we might want to select only the columns whose name follows a certain +pattern (here, starting with "metric"): + +>>> import pandas as pd +>>> from skrub import TableReport +>>> from skrub import selectors as s +>>> df = pd.DataFrame( +... {"id": [1, 2, 3], "metric1": [1, 2, 3], "metric2": [4, 5, 6], "metric3": [7, 8, 9]} +... 
) + +Custom filters should be defined as a dictionary where the key is the name of the +filter that should be displayed in the generated report, and the value is either +a list of columns, the indices of the columns (first column has index 0 etc.), or +a skrub selector, as shown in this example: + +>>> filters = {"only_metrics": s.glob("metric*")} +>>> report = TableReport(df, column_filters=filters) + +Custom filters are placed at the top of the list of filters, in the "Filter columns" +drop-down menu. diff --git a/skrub/_docs/guides/utilities/customizing_configuration.rst b/skrub/_docs/guides/utilities/customizing_configuration.rst new file mode 100644 index 000000000..eb21bffa7 --- /dev/null +++ b/skrub/_docs/guides/utilities/customizing_configuration.rst @@ -0,0 +1,93 @@ +.. |set_config| replace:: :func:`~skrub.set_config` +.. |get_config| replace:: :func:`~skrub.get_config` +.. |config_context| replace:: :func:`~skrub.config_context` + +.. _user_guide_configuration_parameters: + +How to configure and customize the default behavior of skrub +============================================================ + + +Skrub includes a configuration manager that allows setting various parameters +(see the |set_config| documentation for more detail). + +It is possible to change configuration options using the |set_config| function: + +>>> from skrub import set_config +>>> set_config(table_report_verbosity=0) # doctest: +SKIP + +This alters the behavior of skrub in the current script. Each configuration parameter +has an environment variable that can be used to set it permanently. + +Additionally, a |config_context| is provided to allow temporarily altering the +configuration: + +>>> import skrub +>>> with skrub.config_context(table_report_plots_threshold=1): +... pass + +Within this context, only the code executed inside the ``with`` statement is affected. + +The |get_config| function allows retrieving the current configuration. 
+ +Configuration parameters +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The configuration parameters that can be set with ``set_config`` and ``config_context`` +are available by using + +>>> import skrub +>>> config = skrub.get_config() +>>> config.keys() +dict_keys(['use_table_report_data_ops', 'table_report_plots_threshold', 'table_report_associations_threshold', 'table_report_verbosity', 'subsampling_seed', 'enable_subsampling', 'float_precision', 'cardinality_threshold', 'data_dir', 'eager_data_ops', 'data_ops_open_graph_dropdown']) + +These are the parameters currently available in the global configuration: + +.. list-table:: Skrub Configuration Parameters + :header-rows: 1 + :widths: 20 15 25 40 + + * - Parameter Name + - Default Value + - Env Variable + - Description + * - ``use_table_report_data_ops`` + - ``True`` + - ``SKB_USE_TABLE_REPORT_DATA_OPS`` + - Set the HTML representation used for the Data Ops previews. If ``True``, use the :class:`~skrub.TableReport`, otherwise use the default Pandas or Polars representation. + * - ``table_report_verbosity`` + - ``1`` + - ``SKB_TABLE_REPORT_VERBOSITY`` + - Set the verbosity of the :class:`~skrub.TableReport`. If ``1``, print on screen the progress by column, if ``0`` print nothing. + * - ``table_report_plots_threshold`` + - 30 + - ``SKB_TABLE_REPORT_PLOTS_THRESHOLD`` + - If a dataframe has more columns than the value set here, the :class:`~skrub.TableReport` will skip generating the distribution plots (when ``plot_distributions="auto"``, the default). + * - ``table_report_associations_threshold`` + - 30 + - ``SKB_TABLE_REPORT_ASSOCIATIONS_THRESHOLD`` + - If a dataframe has more columns than the value set here, the :class:`~skrub.TableReport` will skip computing the associations (when ``compute_associations="auto"``, the default). + * - ``subsampling_seed`` + - 0 + - ``SKB_SUBSAMPLING_SEED`` + - Set the random seed of subsampling in :func:`skrub.DataOp.skb.subsample()`, when ``how="random"`` is passed. 
+ * - ``enable_subsampling`` + - ``"default"`` + - ``SKB_ENABLE_SUBSAMPLING`` + - Control the activation of subsampling in :func:`skrub.DataOp.skb.subsample()`. If ``"default"``, the behavior of :func:`skrub.DataOp.skb.subsample()` is used. If ``"disable"``, subsampling is never used, so skb.subsample becomes a no-op. If ``"force"``, subsampling is used in all DataOps evaluation modes (eval(), fit_transform, etc.). + * - ``float_precision`` + - 3 + - ``SKB_FLOAT_PRECISION`` + - Control the number of significant digits shown when formatting floats. Applies overall precision rather than fixed decimal places. + * - ``cardinality_threshold`` + - 40 + - ``SKB_CARDINALITY_THRESHOLD`` + - Set the ``cardinality_threshold`` argument of :class:`~skrub.TableVectorizer`. Additionally, set the threshold for warning the user about high cardinality features in the :class:`~skrub.TableReport`. + * - ``data_dir`` + - ``~/skrub_data`` + - ``SKB_DATA_DIRECTORY`` + - Set the default location used by skrub to store datasets and other data, such as the Data Ops reports. + * - ``eager_data_ops`` + - ``True`` + - ``SKB_EAGER_DATA_OPS`` + - Eagerly perform checks on the DataOps as soon as they are created, and compute previews if preview data is available. If disabled, those checks are delayed until the DataOp is actually used. diff --git a/skrub/_docs/guides/utilities/deduplicate_categorical_data.rst b/skrub/_docs/guides/utilities/deduplicate_categorical_data.rst new file mode 100644 index 000000000..c9dc31491 --- /dev/null +++ b/skrub/_docs/guides/utilities/deduplicate_categorical_data.rst @@ -0,0 +1,112 @@ +.. |deduplicate| replace:: :func:`~skrub.deduplicate` + +.. _user_guide_deduplicate: + +How to deduplicate categorical data with |deduplicate| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you have a series or list that contains strings with typos, the |deduplicate| +function may be used to remove the typos. 
This is done by creating a mapping +between the typo strings and the correct strings. + +.. admonition:: How does this work? + :collapsible: closed + + Deduplication is done by first computing the n-gram distance between unique + categories in data, then performing hierarchical clustering on this distance + matrix, and choosing the most frequent element in each cluster as the + 'correct' spelling. This method works best if the true number of + categories is significantly smaller than the number of observed spellings. + +>>> from skrub.datasets import make_deduplication_data +>>> duplicated = make_deduplication_data(examples=['black', 'white'], +... entries_per_example=[5, 5], +... prob_mistake_per_letter=0.3, +... random_state=42) +>>> duplicated # doctest: +SKIP +['blacs', 'black', 'black', 'black', 'black', \ +'uhibe', 'white', 'white', 'white', 'white'] + +To deduplicate the data, we can build a correspondence matrix: + +>>> from skrub import deduplicate +>>> deduplicate_correspondence = deduplicate(duplicated) +>>> deduplicate_correspondence +blacs black +black black +black black +black black +black black +uhibe white +white white +white white +white white +white white +dtype: ... + +>>> deduplicated = list(deduplicate_correspondence) +>>> deduplicated # doctest: +SKIP +['black', 'black', 'black', 'black', 'black', \ +'white', 'white', 'white', 'white', 'white'] + +See the |deduplicate| documentation for caveats and more detail. + +Deduplicating values in a dataframe +----------------------------------- + +|deduplicate| can be used to replace values in a dataframe that contains typos. +This can be done with ``deduplicate_correspondence`` computed above and the +``map`` function in pandas, or the ``replace`` function in polars. 
+ +>>> import pandas as pd +>>> df = pd.DataFrame({'color': duplicated, 'value': range(10)}) +>>> df +color value +0 blacs 0 +1 black 1 +2 black 2 +3 black 3 +4 black 4 +5 uhibe 5 +6 white 6 +7 white 7 +8 white 8 +9 white 9 +>>> df['deduplicated_color'] = df['color'].map(deduplicate_correspondence.to_dict()) +>>> df +color value deduplicated_color +0 blacs 0 black +1 black 1 black +2 black 2 black +3 black 3 black +4 black 4 black +5 uhibe 5 white +6 white 6 white +7 white 7 white +8 white 8 white +9 white 9 white + +With polars: + +>>> import polars as pl # doctest: +SKIP +>>> df = pl.DataFrame({'color': duplicated, 'value': range(10)}) # doctest: +SKIP +>>> df.with_columns(deduplicated_color = pl.col("color").replace( # doctest: +SKIP +... deduplicate_correspondence.to_dict()) +... ) +shape: (10, 3) +┌───────┬───────┬────────────────────┐ +│ color ┆ value ┆ deduplicated_color │ +│ --- ┆ --- ┆ --- │ +│ str ┆ i64 ┆ str │ +╞═══════╪═══════╪════════════════════╡ +│ blacs ┆ 0 ┆ black │ +│ black ┆ 1 ┆ black │ +│ black ┆ 2 ┆ black │ +│ black ┆ 3 ┆ black │ +│ black ┆ 4 ┆ black │ +│ uhibe ┆ 5 ┆ white │ +│ white ┆ 6 ┆ white │ +│ white ┆ 7 ┆ white │ +│ white ┆ 8 ┆ white │ +│ white ┆ 9 ┆ white │ +└───────┴───────┴────────────────────┘ diff --git a/skrub/_docs/guides/utilities/fetching_datasets.rst b/skrub/_docs/guides/utilities/fetching_datasets.rst new file mode 100644 index 000000000..07db3844a --- /dev/null +++ b/skrub/_docs/guides/utilities/fetching_datasets.rst @@ -0,0 +1,46 @@ +Working with the example datasets provided by skrub +------------------------------------------------------- + +Skrub includes a number of datasets used for running examples. Each dataset +can be downloaded using its ``fetch_*`` function, provided in the ``skrub.datasets`` +namespace: + +.. 
code-block:: python + + from skrub.datasets import fetch_employee_salaries + data = fetch_employee_salaries() + +Datasets are stored as :class:`~sklearn.utils.Bunch` objects, which include a path +to each table in the dataset. Datasets should be loaded using the path: + +.. code-block:: python + + import pandas as pd + df = pd.read_csv(data.path) + + +Some datasets include multiple tables: in this case, ``path`` isn't available and +instead each table should be loaded with its own path: + + +.. code-block:: python + + from skrub.datasets import fetch_credit_fraud + data = fetch_credit_fraud() + baskets = pd.read_csv(data.baskets_path) + products = pd.read_csv(data.products_path) + + +Modifying the download location of ``skrub`` datasets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default, datasets are stored in ``~/skrub_data``, where ``~`` is expanded as +the (OS dependent) home directory of the user. The function +:func:`~skrub.datasets.get_data_dir` shows +the location that ``skrub`` uses to store data. + +If needed, it is possible to change this location by modifying the environment +variable ``SKB_DATA_DIRECTORY`` to an **absolute directory path**. + +See :ref:`user_guide_configuration_parameters` for more info on the global skrub +configuration. diff --git a/skrub/_docs/howto.rst b/skrub/_docs/howto.rst new file mode 100644 index 000000000..994e40380 --- /dev/null +++ b/skrub/_docs/howto.rst @@ -0,0 +1,26 @@ +.. _how_to: + +How-tos +-------- + +This page is the index of skrub's How-to guides: these are short guides and examples +on how to complete specific tasks and address specific circumstances. + +For a more long-form discussion on how skrub works and the reasoning behind specific +design choices, refer to the :ref:`User Guide `. For runnable code, see the +:doc:`Example gallery `. For class and function details, see +the :ref:`API Reference `. + + +.. include:: includes/big_toc_css.rst + +.. 
toctree:: + :maxdepth: 2 + + guides/table_report/01_alter_appearance.rst + guides/table_report/02_exporting.rst + guides/table_report/03_finding_correlated_columns.rst + guides/table_report/04_custom_filters.rst + guides/utilities/customizing_configuration.rst + guides/utilities/deduplicate_categorical_data.rst + guides/utilities/fetching_datasets.rst diff --git a/skrub/_docs/index.rst b/skrub/_docs/index.rst new file mode 100644 index 000000000..18d745435 --- /dev/null +++ b/skrub/_docs/index.rst @@ -0,0 +1,18 @@ +.. title:: Home + +.. toctree:: + :maxdepth: 2 + +.. currentmodule:: skrub + +.. toctree:: + :hidden: + + install + documentation + howto + reference/index + auto_examples/index + learning_materials + CHANGES + development diff --git a/skrub/_docs/install.rst b/skrub/_docs/install.rst new file mode 100644 index 000000000..3f02d7041 --- /dev/null +++ b/skrub/_docs/install.rst @@ -0,0 +1,238 @@ +.. _installation_instructions: + +.. currentmodule:: skrub + +======= +Install +======= + +.. raw:: html + +
+ + + +
+
+
+ +.. code:: console + + pip install skrub -U + +| + +**Deep learning dependencies** + +Deep-learning based encoders like :class:`TextEncoder` require installing optional +dependencies to use them. The following will install +`torch `_, +`transformers `_, +and `sentence-transformers `_. + +.. code:: console + + $ pip install skrub[transformers] -U + + +.. raw:: html + +
+
+
+ +.. code:: console + + conda install -c conda-forge skrub + +| + +**Deep learning dependencies** + +Deep-learning based encoders like :class:`TextEncoder` require installing optional +dependencies to use them. The following will install +`torch `_, +`transformers `_, +and `sentence-transformers `_. + +.. code:: console + + $ conda install -c conda-forge skrub[transformers] + + +.. raw:: html + +
+
+
+ +.. code:: console + + mamba install -c conda-forge skrub + +| + +**Deep learning dependencies** + +Deep-learning based encoders like :class:`TextEncoder` require installing optional +dependencies to use them. The following will install +`torch `_, +`transformers `_, +and `sentence-transformers `_. + +.. code:: console + + $ mamba install -c conda-forge skrub[transformers] + + +.. raw:: html + +
+
+
+ +.. _installing_from_source: + +Advanced Usage for Contributors +------------------------------- + +1. Fork the project +''''''''''''''''''' + +To contribute to the project, you first need to +`fork skrub on GitHub `_. + +That will enable you to push your commits to a branch *on your fork*. + +2. Clone your fork +'''''''''''''''''' + +Clone your forked repo to your local machine: + +.. code:: console + + git clone https://github.com/<YOUR_USERNAME>/skrub + cd skrub + +Next, add the *upstream* remote (i.e. the official skrub repository). This allows you +to pull the latest changes from the main repository: + +.. code:: console + + git remote add upstream https://github.com/skrub-data/skrub.git + +Verify that both the origin (your fork) and upstream (official repo) +are correctly set up: + +.. code:: console + + git remote -v + +You should see something like this: + +.. code:: console + + origin git@github.com:<YOUR_USERNAME>/skrub.git (fetch) + origin git@github.com:<YOUR_USERNAME>/skrub.git (push) + upstream git@github.com:skrub-data/skrub.git (fetch) + upstream git@github.com:skrub-data/skrub.git (push) + + +3. Setup your environment +''''''''''''''''''''''''' + +Now, set up a development environment. +You can set up a virtual environment with Conda, or with python's ``venv``: + +- With `conda `__: + +.. code:: console + + conda create -n env_skrub python=3.13 + conda activate env_skrub + +- With `venv `__: + +.. code:: console + + python -m venv env_skrub + source env_skrub/bin/activate + +Then, with the environment activated and at the root of your local copy of skrub, +install the local package in editable mode with development dependencies: + +.. code:: console + + pip install -e ".[dev]" + +Enabling pre-commit hooks ensures code style consistency by triggering checks (mainly formatting) every time you run a ``git commit``. + +.. code:: console + + pre-commit install + + +Optionally, configure Git to ignore certain revisions in git blame and +IDE integrations. 
These revisions are listed in .git-blame-ignore-revs: + +.. code:: console + + git config blame.ignoreRevsFile .git-blame-ignore-revs + +4. Run the tests +'''''''''''''''' + +To ensure your environment is correctly set up, run the test suite: + +.. code:: console + + pytest --pyargs skrub + +Testing should take about 5 minutes. + +If you see some warnings like: + +.. code:: sh + + UserWarning: Only pandas and polars DataFrames are supported, but input is a Numpy array. Please convert Numpy arrays to DataFrames before passing them to skrub transformers. Converting to pandas DataFrame with columns ['0', '1', …]. + warnings.warn( + +This is expected, and you may proceed with the next steps without worrying about them. +However, no tests should fail at this point: if they do fail, then let us know. + +After that, your environment is ready for development! + +**Deep learning dependencies** + +Deep-learning based encoders like :class:`TextEncoder` require installing optional +dependencies to use them. The following will install +`torch `_, +`transformers `_, +and `sentence-transformers `_. + +.. code:: console + + $ pip install -e ".[transformers]" + + +Now that you're set up, +you may return to :ref:`writing your first pull request` +and start coding! + +.. raw:: html + +
+
+
+ diff --git a/skrub/_docs/joining_dataframes.rst b/skrub/_docs/joining_dataframes.rst new file mode 100644 index 000000000..074421399 --- /dev/null +++ b/skrub/_docs/joining_dataframes.rst @@ -0,0 +1,11 @@ +.. _user_guide_joining_dataframes: + +Joining Dataframes +================== + +This section covers the various methods provided by skrub to join dataframes. + +.. toctree:: + :maxdepth: 3 + + modules/joining_tables/assembling diff --git a/skrub/_docs/learning_materials.rst b/skrub/_docs/learning_materials.rst new file mode 100644 index 000000000..0ed79113f --- /dev/null +++ b/skrub/_docs/learning_materials.rst @@ -0,0 +1,11 @@ +Learning Materials +================== + +You are being redirected to the new learning materials page. + +.. raw:: html + + + +If you are not redirected automatically, follow this +`link `_. diff --git a/skrub/_docs/modules/column_level_featurizing/advanced_columnwise_operations.rst b/skrub/_docs/modules/column_level_featurizing/advanced_columnwise_operations.rst new file mode 100644 index 000000000..89fcba564 --- /dev/null +++ b/skrub/_docs/modules/column_level_featurizing/advanced_columnwise_operations.rst @@ -0,0 +1,133 @@ +.. currentmodule:: skrub + +.. |ApplyToCols| replace:: :class:`ApplyToCols` +.. |RejectColumn| replace:: :class:`core.RejectColumn` +.. |SingleColumnTranformer| replace:: :class:`core.SingleColumnTransformer` +.. |ToDatetime| replace:: :class:`ToDatetime` + +.. _user_guide_single_column_transformer: + +Advanced columnwise operations +------------------------------ + +.. 
_single_column_transformer: + +The single column transformer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In cases where we want to apply a custom transformation to a series we need the |ApplyToCols| +structure to handle multiple columns, and if this transformation needs to be able to reject certain +columns and communicate this to |ApplyToCols|, we must create a transformer from scratch +that raises |RejectColumn| when appropriate: this can be done with the |SingleColumnTranformer| class. + +For instance, we might want to create a custom transformer specialized in parsing zip codes: +in this example, the zip codes need to have the format ``AB123``, that is two letters +followed by three digits. + +>>> import pandas as pd +>>> df = pd.DataFrame({'sent': ["AB123", "BD601", "HS014"], 'received': ["AB1C45", "DU3K93", "WB9M88"]}) +>>> df + sent received +0 AB123 AB1C45 +1 BD601 DU3K93 +2 HS014 WB9M88 + +We would like to be able to "unpack" the zip code so that we have a column for the +letters and one for the digits; the transformer should also be able to "reject" a column +if it does not satisfy the format we specify. A "rejected" column should be passed +through unchanged, as it cannot be handled by this particular transformer. + +We can therefore define a custom class that inherits from |SingleColumnTranformer| +and that raises |RejectColumn| if a column cannot be handled: + +>>> from skrub.core import RejectColumn, SingleColumnTransformer +>>> class ZipcodeParser(SingleColumnTransformer): +... def __init__(self): +... return +... def fit_transform(self, X, y=None): +... if any(X.map(len) != 5): +... raise RejectColumn('This transformer only takes zip codes of length 5.') +... else: +... letters = X.map(lambda s: s[:2]) +... try: +... numbers = X.map(lambda s: int(s[2:])) +... except: +... raise RejectColumn('Input zip codes must consist of two letters followed by three numbers.') +... 
return(pd.DataFrame({'letters': letters, 'numbers': numbers})) +>>> ZipcodeParser().fit_transform(df["sent"]) + letters numbers +0 AB 123 +1 BD 601 +2 HS 14 + +We can use |ApplyToCols| to apply this transformer to the entire dataframe at once, +and set ``allow_reject=True`` to let rejected columns through without changes: + +>>> from skrub import ApplyToCols +>>> ApplyToCols(ZipcodeParser(), allow_reject=True).fit_transform(df) +letters numbers received +0 AB 123 AB1C45 +1 BD 601 DU3K93 +2 HS 14 WB9M88 + +Note how the ``"received"`` column has been "rejected" and passed through unmodified. + + + +Rejection handling with |ApplyToCols| and |RejectColumn| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The combination of |ApplyToCols| and |RejectColumn| allows flexible manipulation +and error checking of dataframes. In the previous example, we decided to ignore the +malformed ``"received"`` column by setting ``allow_reject=True``. If, however, +we want our transformer to fail if it encounters a column that it cannot parse, +we can keep the default value of ``allow_reject=False``, so that the transform +fails as soon as a malformed column is encountered: + +>>> ApplyToCols(ZipcodeParser()).fit_transform(df) # doctest: +SKIP +Traceback (most recent call last): + ... +skrub.core.RejectColumn: This transformer only takes zip codes of length 5. +Transformer ZipcodeParser.fit_transform failed on column 'received'. See above for the full traceback. + +Letting rejected columns through can be useful for situations in which we do not +know the content of a column in advance, such as when we try to convert the columns +of a dataframe to datetimes without knowing which ones actually contain dates. + +>>> from skrub import ToDatetime +>>> df = pd.DataFrame(dict(birthday=["29/01/2024"], city=["London"])) +>>> df + birthday city +0 29/01/2024 London +>>> df.dtypes +birthday ... +city ... 
+dtype: object + +Converting a datetime column would work: + +>>> ToDatetime().fit_transform(df["birthday"]) +0 2024-01-29 +Name: birthday, dtype: datetime64[...] + +While non-datetimes would raise |RejectColumn|: + +>>> ToDatetime().fit_transform(df["city"]) +Traceback (most recent call last): + ... +skrub.core.RejectColumn: Could not find a datetime format for column 'city'. + +The ``allow_reject`` parameter in |ApplyToCols| allows to apply the same transformer +to all columns without having to worry about which columns will actually be converted: +here, |ToDatetime| is applied only to the "birthday" column, while "city" is passed +through unchanged and no exception is raised. + +>>> to_datetime = ApplyToCols(ToDatetime(), allow_reject=True) +>>> transformed = to_datetime.fit_transform(df) +>>> transformed + birthday city +0 2024-01-29 London + +We can see that the only column that has a transformer is "birthday": + +>>> to_datetime.transformers_ +{'birthday': ToDatetime()} diff --git a/skrub/_docs/modules/column_level_featurizing/feature_engineering_categorical.rst b/skrub/_docs/modules/column_level_featurizing/feature_engineering_categorical.rst new file mode 100644 index 000000000..2e159f687 --- /dev/null +++ b/skrub/_docs/modules/column_level_featurizing/feature_engineering_categorical.rst @@ -0,0 +1,132 @@ + +.. |StringEncoder| replace:: :class:`~skrub.StringEncoder` +.. |TextEncoder| replace:: :class:`~skrub.TextEncoder` +.. |MinHashEncoder| replace:: :class:`~skrub.MinHashEncoder` +.. |GapEncoder| replace:: :class:`~skrub.GapEncoder` +.. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer` +.. |tabular_pipeline| replace:: :func:`~skrub.tabular_pipeline` +.. |OneHotEncoder| replace:: :class:`~sklearn.preprocessing.OneHotEncoder` +.. |OrdinalEncoder| replace:: :class:`~sklearn.preprocessing.OrdinalEncoder` + +.. 
_user_guide_feature_engineering_categorical: + +Encoding string and text columns as numeric features +====================================================== + +In skrub, categorical features are features that are not parsed as either numbers +or datetimes. They may have a Categorical datatype, or they may simply be strings. +These features are very common in practice, and there are various strategies that +can be employed to handle them. + +A common approach is to use the |OneHotEncoder| or the |OrdinalEncoder| on +categorical features, but both approaches have limitations. The |OneHotEncoder| +becomes expensive when the number of distinct values becomes large, while the +|OrdinalEncoder| introduces order in features that may not have an inherent ordering. + +To address these shortcomings and generalize to more columns, skrub implements +four different transformers, each with its own pros and cons. + +All encoders work like regular scikit-learn transformers. All encoders +take a parameter ``n_components`` to specify how many features should +be generated for each input feature. + +>>> import pandas as pd +>>> from skrub import StringEncoder + +>>> X = pd.Series([ +... "The professor snatched a good interview out of the jaws of these questions.", +... "Bookmarking this to watch later.", +... "When you don't know the lyrics of the song except the chorus", +... ], name='video comments') + +>>> encoder = StringEncoder(n_components=2) + +The result of the ``.fit_transform`` is a new dataframe that contains as many columns +as the number of components specified (here, 2). +Features generated by each encoder (except the |GapEncoder|) are always named after +the original column name (here, ``"video comments"``), followed by the index of the +resulting feature. 
+ +>>> encoder.fit_transform(X) # doctest: +SKIP + video comments_0 video comments_1 +0 1.322969 -0.163066 +1 0.379689 1.659318 +2 1.306402 -0.317126 + +The |GapEncoder| names the columns after the categories it estimates from the +data, which are built by capturing combinations of substrings that frequently co-occur. +More information on the functioning and the theoretical background of the |GapEncoder| +is available in the documentation of the encoder itself. + +>>> from skrub import GapEncoder +>>> GapEncoder(n_components=2).fit_transform(X) # doctest: +SKIP + video comments: bookmarking, except, lyrics video comments: professor, questions, interview +0 0.000786 1.360704 +1 0.559531 0.000717 +2 0.982307 0.099680 + +Choosing the right encoder for the job +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- |StringEncoder|: **the default encoder, strong in most cases**: A strong and quick + baseline for both short strings with high cardinality and long text. This encoder + computes the n-gram frequency using tf-idf vectorization, followed by truncated SVD + (`Latent Semantic Analysis `_). + This is the default encoder used by the |TableVectorizer| and the |tabular_pipeline|. + +- |TextEncoder|: **language model-based, strong on text but expensive to run**: + This encoder encodes string features using pretrained language models from the + HuggingFace Hub. It is a wrapper around `sentence-transformers `_ + compatible with the scikit-learn API and usable in pipelines. Best for free-flowing + text and when columns include context found in the pretrained model (e.g., names of + cities etc.). Note that this encoder can take a very long time to train, especially + on large datasets and on CPU. The |TextEncoder| has additional dependencies that + are not included in the standard skrub installation. + Refer to :ref:`installation_instructions` for info on how to prepare the + environment. 
+ +- |MinHashEncoder|: **very fast encoder, but not as effective as the others**: + This encoder decomposes strings into n-grams, then applies the MinHash method to + convert them into numeric features. Fast to train, but features usually yield worse + results compared to other methods. + +- |GapEncoder|: **an interpretable, if slower encoder**: The |GapEncoder| estimates + "latent categories" on the training data by finding common n-grams between strings, + then encodes the categories as real numbers. It allows access to grouped features + via ``.get_feature_names_out()``, which allows for better interpretability. This + encoder may require a long time to train. + +.. list-table:: + :header-rows: 1 + :widths: 15 15 25 20 25 + + * - Encoder + - Training time + - Performance on categorical data + - Performance on text data + - Notes + * - |StringEncoder| + - Fast + - Good + - Good + - + * - |TextEncoder| + - Very slow + - Mediocre to good + - Very good + - Requires the ``transformers`` package to be installed + * - |GapEncoder| + - Slow + - Good + - Mediocre to good + - Interpretable + * - |MinHashEncoder| + - Very fast + - Mediocre to good + - Mediocre + - + +:ref:`This example ` and this +`blog post `_ +include a more systematic analysis of each method. +The docstrings of each encoder provide additional details on how they work. diff --git a/skrub/_docs/modules/column_level_featurizing/feature_engineering_datetimes.rst b/skrub/_docs/modules/column_level_featurizing/feature_engineering_datetimes.rst new file mode 100644 index 000000000..4df4642bb --- /dev/null +++ b/skrub/_docs/modules/column_level_featurizing/feature_engineering_datetimes.rst @@ -0,0 +1,276 @@ +.. |ToDatetime| replace:: :class:`~skrub.ToDatetime` +.. |to_datetime| replace:: :func:`~skrub.to_datetime` +.. |DatetimeEncoder| replace:: :class:`~skrub.DatetimeEncoder` + +.. 
_user_guide_feature_engineering_datetimes: + +Handling datetimes: parsing from strings and encoding as numbers +================================================================ +Depending on the input data, timestamps and dates can cause issues, or require +specific parsing. For example, reading input data stored in ``csv`` format results +in datetime columns that are treated as strings. + +In such cases, parsing columns that contain timestamps or dates so that they are +treated as datetime objects allows to make use of advanced functionalities available +in the standard Python library, Pandas and Polars. + +Skrub provides objects that help with parsing such data (|ToDatetime|), as well +as the |DatetimeEncoder|, a datetime-specific encoder that feature engineers +datetime columns. + + +Parsing Datetime Strings with |ToDatetime| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Skrub provides helpers to parse datetime string columns automatically: + +- The |ToDatetime| transformer learns a mapping between columns and their formats. + It then applies this mapping during the transform step. +- The |to_datetime| function applies the |ToDatetime| transformer to all columns + in the dataframe, and tries to parse them as datetimes. The format can be + inferred or user-specified with the ``format`` argument. + + +>>> import pandas as pd +>>> s = pd.Series(["2024-05-05T13:17:52", None, "2024-05-07T13:17:52"], name="when") +>>> s +0 2024-05-05T13:17:52 +1 ... +2 2024-05-07T13:17:52 +Name: when, dtype: ... + +>>> from skrub import ToDatetime + +>>> to_dt = ToDatetime() +>>> to_dt.fit_transform(s) +0 2024-05-05 13:17:52 +1 NaT +2 2024-05-07 13:17:52 +Name: when, dtype: datetime64[...] + +The attributes ``format_``, ``output_dtype_``, ``output_time_zone_`` +record information about the conversion result. 
+ +>>> to_dt.format_ +'%Y-%m-%dT%H:%M:%S' +>>> to_dt.output_dtype_ +dtype('<M8[...]') +>>> to_dt.output_time_zone_ is None +True + +Once |ToDatetime| has been successfully fitted, ``transform`` will always try to +parse datetimes with the same format and output the same ``dtype``. Entries that +fail to be converted result in a null value: + +>>> s = pd.Series(["2024-05-05T13:17:52", None, "2024-05-07T13:17:52"], name="when") +>>> to_dt = ToDatetime().fit(s) +>>> to_dt.transform(s) +0 2024-05-05 13:17:52 +1 NaT +2 2024-05-07 13:17:52 +Name: when, dtype: datetime64[...] +>>> s = pd.Series(["05/05/2024", None, "07/05/2024"], name="when") +>>> to_dt.transform(s) +0 NaT +1 NaT +2 NaT +Name: when, dtype: datetime64[...] + + +Dealing with Time zones +^^^^^^^^^^^^^^^^^^^^^^^ + +During ``fit``, parsing strings that contain fixed offsets results in datetimes +in UTC. Mixed offsets are supported and will all be converted to UTC. + +>>> s = pd.Series(["2020-01-01T04:00:00+02:00", "2020-01-01T04:00:00+03:00"]) +>>> to_dt.fit_transform(s) +0 2020-01-01 02:00:00+00:00 +1 2020-01-01 01:00:00+00:00 +dtype: datetime64[..., UTC] +>>> to_dt.format_ +'%Y-%m-%dT%H:%M:%S%z' +>>> to_dt.output_time_zone_ +'UTC' + +Strings with no timezone indication result in naive datetimes: + +>>> s = pd.Series(["2020-01-01T04:00:00", "2020-01-01T04:00:00"]) +>>> to_dt.fit_transform(s) +0 2020-01-01 04:00:00 +1 2020-01-01 04:00:00 +dtype: datetime64[...] +>>> to_dt.output_time_zone_ is None +True + +During ``transform``, outputs are cast to the same ``dtype`` that was found +during ``fit``. This includes the timezone, which is converted if necessary. + +>>> s_paris = pd.to_datetime( +... pd.Series(["2024-05-07T14:24:49", "2024-05-06T14:24:49"]) +... 
).dt.tz_localize("Europe/Paris") +>>> s_paris +0 2024-05-07 14:24:49+02:00 +1 2024-05-06 14:24:49+02:00 +dtype: datetime64[..., Europe/Paris] +>>> to_dt = ToDatetime().fit(s_paris) +>>> to_dt.output_dtype_ +datetime64[..., Europe/Paris] + +Here our converter is set to output datetimes with nanosecond resolution, +localized in "Europe/Paris". + +We may have a column in a different timezone: + +>>> s_london = s_paris.dt.tz_convert("Europe/London") +>>> s_london +0 2024-05-07 13:24:49+01:00 +1 2024-05-06 13:24:49+01:00 +dtype: datetime64[..., Europe/London] + +Here the timezone is "Europe/London" and the times are offset by 1 hour. During +``transform`` datetimes will be converted to the original dtype and the +"Europe/Paris" timezone: + +>>> to_dt.transform(s_london) +0 2024-05-07 14:24:49+02:00 +1 2024-05-06 14:24:49+02:00 +dtype: datetime64[..., Europe/Paris] + +Moreover, we may have to transform a timezone-naive column whereas the +transformer was fitted on a timezone-aware column. Note that this is somewhat a +corner case unlikely to happen in practice if the inputs to ``fit`` and +``transform`` come from the same dataframe. + +In this case, we make the arbitrary choice to assume that the timezone-naive +datetimes are in UTC. + +>>> s_naive = s_paris.dt.tz_convert(None) +>>> to_dt.transform(s_naive) +0 2024-05-07 14:24:49+02:00 +1 2024-05-06 14:24:49+02:00 +dtype: datetime64[..., Europe/Paris] + +Conversely, a transformer fitted on a timezone-naive column can convert +timezone-aware columns. Here also, we assume the naive datetimes were in UTC. + +>>> to_dt = ToDatetime().fit(s_naive) +>>> to_dt.transform(s_london) +0 2024-05-07 12:24:49 +1 2024-05-06 12:24:49 +dtype: datetime64[...] 
+ +Caveats when dealing with month first/day first conventions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +When parsing strings in one of the formats above, |ToDatetime| tries to guess +if the month comes first (USA convention) or the day (rest of the world) from +the data. + +>>> s = pd.Series(["05/23/2024"]) +>>> to_dt.fit_transform(s) +0 2024-05-23 +dtype: datetime64[...] +>>> to_dt.format_ +'%m/%d/%Y' + +Here we could infer ``'%m/%d/%Y'`` because there is no 23rd month in a year. +Similarly, + +>>> s = pd.Series(["23/05/2024"]) +>>> to_dt.fit_transform(s) +0 2024-05-23 +dtype: datetime64[...] +>>> to_dt.format_ +'%d/%m/%Y' + +In the case where it cannot be inferred, the USA convention is used: + +>>> s = pd.Series(["03/05/2024"]) +>>> to_dt.fit_transform(s) +0 2024-03-05 +dtype: datetime64[...] +>>> to_dt.format_ +'%m/%d/%Y' + +.. _user_guide_datetime_encoder: + +Encoding and Feature Engineering with |DatetimeEncoder| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Once datetime columns have been parsed, they can be encoded as numeric features with +the |DatetimeEncoder|, by extracting temporal features (year, month, day, +hour, etc.). No timezone conversion is done; the timezone +in the feature is retained. The |DatetimeEncoder| rejects non-datetime columns, +so it should only be applied after conversion using |ToDatetime|. +If the input column is timezone aware, the extracted features will be in the column's +timezone; this is normally the case when the datetime column has been encoded with |ToDatetime|. + +>>> import pandas as pd +>>> login = pd.to_datetime( +... pd.Series( +... ["2024-05-13T12:05:36", None, "2024-05-15T13:46:02"], name="login") +... ) +>>> login +0 2024-05-13 12:05:36 +1 NaT +2 2024-05-15 13:46:02 +Name: login, dtype: datetime64[...] 
+>>> from skrub import DatetimeEncoder + +>>> DatetimeEncoder().fit_transform(login) +login_year login_month login_day login_hour login_total_seconds +0 2024.0 5.0 13.0 12.0 1.715602e+09 +1 NaN NaN NaN NaN NaN +2 2024.0 5.0 15.0 13.0 1.715781e+09 + +Additionally, the |DatetimeEncoder| can include the following features: + +- Number of seconds from epoch (``add_total_seconds``, ``True`` by default) +- Day of the week (``add_weekday``) +- Day of the year (``add_day_of_year``) + +Periodic encoding is supported through trigonometric (circular) and spline +encoding: set the ``periodic_encoding`` parameter to ``circular`` or ``spline``. + +.. figure:: /_static/periodic_features.png + :alt: Periodic encoding of datetime features + :align: center + :width: 70% + + Example of periodic encoding of datetime features using circular and spline methods. + +Note that if ``periodic_encoding`` is set, the respective features are removed +to reduce redundancy: + +>>> encoder = DatetimeEncoder() +>>> encoder.fit_transform(login).columns +Index(['login_year', 'login_month', 'login_day', 'login_hour', + 'login_total_seconds'], + dtype=...) +>>> from sklearn.pipeline import make_pipeline +>>> encoder = make_pipeline(ToDatetime(), DatetimeEncoder(periodic_encoding="circular")) +>>> encoder.fit_transform(login).columns +Index(['login_year', 'login_total_seconds', 'login_month_circular_0', + 'login_month_circular_1', 'login_day_circular_0', + 'login_day_circular_1', 'login_hour_circular_0', + 'login_hour_circular_1'], + dtype=...) + + +The |DatetimeEncoder| uses hardcoded values for generating periodic features. 
+The period of each feature is: + +- ``month``: 12 (month in year) +- ``day``: 30 (day in month) +- ``hour``: 24 (hour in day) +- ``weekday``: 7 (day in week) + +Additionally, we specify the number of splines for each feature to avoid +generating too many features: + +- ``month``: 12 +- ``day``: 4 +- ``hour``: 12 +- ``weekday``: 7 + +All extracted features are provided as ``float32`` columns. diff --git a/skrub/_docs/modules/column_level_featurizing/feature_engineering_numerical.rst b/skrub/_docs/modules/column_level_featurizing/feature_engineering_numerical.rst new file mode 100644 index 000000000..e848d073a --- /dev/null +++ b/skrub/_docs/modules/column_level_featurizing/feature_engineering_numerical.rst @@ -0,0 +1,106 @@ +.. |SquashingScaler| replace:: :class:`~skrub.SquashingScaler` +.. |ToFloat| replace:: :class:`~skrub.ToFloat` +.. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer` +.. |Cleaner| replace:: :class:`~skrub.Cleaner` +.. |RobustScaler| replace:: :class:`~sklearn.preprocessing.RobustScaler` +.. |RejectColumn| replace:: :class:`~skrub.core.RejectColumn` + +.. _user_guide_feature_engineering_numeric_to_float: + +Parsing and scaling numeric features +==================================== + +Converting heterogeneous numeric values to uniform float32 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Many tabular datasets stored as csv files contain numeric information stored as +strings, mixed representations, locale-specific formats, or other non-standard +encodings. 
+Common issues include: + +- Thousands separators (``1,234.56`` or ``1 234,56``) +- Use of apostrophes as separators (``4'567.89``) +- Negative numbers encoded inside parentheses (``(1,234.56)``) +- String columns that contain mostly numeric values, but with occasional invalid entries + +To provide consistent numeric behavior, skrub includes the |ToFloat| transformer, +which standardizes all numeric-like columns to ``float32`` and handles a wide +range of real-world formatting issues automatically. Columns that cannot be parsed +are rejected with a |RejectColumn| exception. + +Converting numbers to ``float32`` has the advantage of reducing memory pressure, +while retaining most of the information for training models. + +>>> import pandas as pd +>>> from skrub import ToFloat +>>> s = pd.Series(['1.1', None, '3.3'], name='x') +>>> to_float = ToFloat() +>>> to_float.fit_transform(s) +0 1.1 +1 NaN +2 3.3 +Name: x, dtype: float32 + +If the transformer is fitted correctly, invalid values encountered at transform +time are replaced by ``NaN``: + +>>> to_float.transform(pd.Series(['3.3', 'invalid'], name='x')) +0 3.3 +1 NaN +Name: x, dtype: float32 + +Locale-dependent decimal separators can be handled by specifying the +``decimal`` and ``thousand`` parameters. Here we use a comma as the decimal separator, and +a space as the thousands separator: + +>>> s = pd.Series(["4 567,89", "12 567,89"], name="x") +>>> ToFloat(decimal=",", thousand=" ").fit_transform(s) +0 4567.8... +1 12567.8... +Name: x, dtype: float32 + +In some contexts, negative numbers may be represented with parentheses, instead of +using ``-``. This case is handled by the ``parentheses`` boolean parameter: + +>>> s = pd.Series(["-1,234.56", "(1,234.56)"], name="neg") +>>> ToFloat(thousand=",", parentheses=True).fit_transform(s) +0 -1234.5... +1 -1234.5... +Name: neg, dtype: float32 + + +.. 
_user_guide_squashing_scaler: + +Robust scaling of numeric features using |SquashingScaler| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The |SquashingScaler| is a robust scaler for numeric features, particularly +useful when features include outliers (such as infinite values); missing values +are left unchanged (they are not interpolated). +The |SquashingScaler| centers and scales the data in such a way that outliers are +less likely to skew the final result compared to alternative methods. + +Based on the specified ``quantile_range`` parameter, the scaler employs a scikit-learn +|RobustScaler| to rescale the values in a way that the quantile range occupies +an interval of length two, centering the median to zero. It therefore ensures that +inliers are spread to a reasonable range. Afterwards, it uses a smooth clipping +function to ensure all values (including outliers and infinite values) are in the +range ``[-max_absolute_value, max_absolute_value]``. By default, +``max_absolute_value=3``. + +>>> import pandas as pd +>>> import numpy as np +>>> from skrub import SquashingScaler + +>>> X = pd.DataFrame(dict(col=[np.inf, -np.inf, 3, -1, np.nan, 2])) +>>> SquashingScaler(max_absolute_value=3).fit_transform(X) +array([[ 3. ], + [-3. ], + [ 0.49319696], + [-1.34164079], + [ nan], + [ 0. ]]) + +More information about the theory behind the scaler is available in the +|SquashingScaler| documentation, while this +:ref:`working example ` compares +different scalers when used on data that include outliers. diff --git a/skrub/_docs/modules/data_ops/basics/building_data_ops_plan.rst b/skrub/_docs/modules/data_ops/basics/building_data_ops_plan.rst new file mode 100644 index 000000000..00e42f316 --- /dev/null +++ b/skrub/_docs/modules/data_ops/basics/building_data_ops_plan.rst @@ -0,0 +1,94 @@ +.. currentmodule:: skrub + +.. 
_user_guide_data_ops_plan: + +Building a simple DataOps plan +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Let's build a simple DataOps plan that adds two variables together. + +We start by declaring the variables: + +>>> import skrub + +>>> a = skrub.var("a") +>>> b = skrub.var("b") + +We then apply transformations (in this case, an addition) composing more complex DataOps. + +>>> c = a + b +>>> c + + +Finally, we can evaluate the plan by passing the **environment** in which the +plan should be evaluated. The environment is a dictionary that maps variable names +to their values. + +>>> c.skb.eval({"a": 10, "b": 6}) +16 + +As shown above, the special ``.skb`` attribute allows to interact with the DataOp +object itself, and :meth:`.skb.eval() ` evaluates the DataOp plan. +By default, :meth:`.skb.eval() ` uses the values passed in the +variable definitions, but it can also take an explicit environment +dictionary as an argument. + + +Finally, we can export the plan as a ``Learner`` that can be fitted and applied to +new data: + +>>> learner = c.skb.make_learner() +>>> learner.fit_transform({"a": 10, "b": 7}) +17 + +When using Data Ops, it is important to ensure that all operations are being tracked +by acting on the Data Ops, rather than (for example) the starting dataframe. +Consider the following example: + +>>> import pandas as pd +>>> df = pd.DataFrame({"col": [1, 2, 3]}) +>>> df + col +0 1 +1 2 +2 3 +>>> df_do = skrub.var("df", df) +>>> df_do + +Result: +――――――― + col +0 1 +1 2 +2 3 + +``df_do`` is a Data Op that wraps ``df``, so its preview shows the content of ``df``. +Then, if we now modify ``df_do`` by doubling the column, we can see that both steps +(the creation of the variable, and the doubling) are now tracked by the final +Data Op. 
+ +>>> df_doubled = df_do.assign(col=df_do["col"]*2) +>>> df_doubled + +Result: +――――――― + col +0 2 +1 4 +2 6 +>>> print(df_doubled.skb.describe_steps()) +Var 'df' +( Var 'df' )* +GetItem 'col' +BinOp: mul +CallMethod 'assign' +* Cached, not recomputed + +On the other hand, working directly on ``df`` leads us to the same result, but +the actual operations are not being tracked. +By working only on Data Ops we ensure that all the operations done on the data +are added correctly to the computational graph, which then allows the resulting +learner to execute all steps as intended. + +See :ref:`sphx_glr_auto_tutorials_1110_data_ops_intro.py` for an introductory +example on how to use skrub DataOps on a single dataframe. diff --git a/skrub/_docs/modules/data_ops/basics/control_flow.rst b/skrub/_docs/modules/data_ops/basics/control_flow.rst new file mode 100644 index 000000000..7cd1fc31a --- /dev/null +++ b/skrub/_docs/modules/data_ops/basics/control_flow.rst @@ -0,0 +1,176 @@ +.. currentmodule:: skrub + +.. _user_guide_data_ops_control_flow: + +Control flow in DataOps: eager and deferred evaluation +====================================================== + +DataOps represent computations that have not been executed yet, and will +only be triggered when we call :meth:`.skb.eval() `, or when we +create the pipeline with :meth:`.skb.make_learner() ` and +call one of its methods such as ``fit()``. + +This means we cannot use standard Python control flow statements such as ``if``, +``for``, ``with``, etc. with DataOps, because those constructs would execute +immediately. + +>>> import pandas as pd +>>> import skrub +>>> orders_df = pd.DataFrame( +... { +... "item": ["pen", "cup", "pen", "fork"], +... "price": [1.5, None, 1.5, 2.2], +... "qty": [1, 1, 2, 4], +... } +... ) +>>> orders = skrub.var("orders", orders_df) +>>> for column in orders.columns: +... pass +Traceback (most recent call last): + ... 
+TypeError: This object is a DataOp that will be evaluated later, when your learner runs. So it is not possible to eagerly iterate over it now. + +We get an error because the ``for`` statement tries to iterate immediately +over the columns. However, ``orders.columns`` is not an actual list of +columns: it is a skrub DataOp that will produce a list of columns, later, +when we run the computation. + +This remains true even if we have provided a value for ``orders`` and we can +see a result for that value: + +>>> orders.columns + +Result: +――――――― +Index(['item', 'price', 'qty'], dtype=...) + +The "result" we see is an *example* result that the computation produces for the +data we provided. But we want to fit our pipeline and apply it to different +datasets, for which it will return a new object every time. So even if we see a +preview of the output on the data we provided, ``orders.columns`` still +represents a future computation that remains to be evaluated. + +Therefore, we must delay the execution of the ``for`` statement until the computation +actually runs and ``orders.columns`` has been evaluated. + +We can achieve this by defining a function that contains the control flow logic +we need, and decorating it with :func:`deferred`. This decorator defers the execution +of the function: when we call it, it does not run immediately. Instead, it returns +a skrub DataOp that wraps the function call. The original function is only +executed when the DataOp is evaluated, and will return the result as a DataOp. + +>>> @skrub.deferred +... def with_upper_columns(df): +... new_columns = [c.upper() for c in df.columns] +... return df.set_axis(new_columns, axis="columns") + +>>> with_upper_columns(orders) + +Result: +――――――― + ITEM PRICE QTY +0 pen 1.5 1 +1 cup NaN 1 +2 pen 1.5 2 +3 fork 2.2 4 + +When the computation runs, ``orders`` will be evaluated first and the result (an +actual dataframe) will be passed as the ``df`` argument to our function. 
In practice, +the code inside a deferred function is completely equivalent to eager code, so +it is possible to use any Python control flow statement inside it, as well as +act on the data as if it were a regular DataFrame. + +Within a function decorated with :func:`deferred`, objects are evaluated eagerly, +so it is possible to use standard Python control flow statements such as +``if``, ``for``, and it is possible to treat the inputs as if they were +regular objects (e.g., a Pandas DataFrame or Series). + +When the first argument to our function is a skrub DataOp, rather than +applying ``deferred`` and calling the function as shown above we can use +:meth:`.skb.apply_func() `: + +>>> def with_upper_columns(df): +... new_columns = [c.upper() for c in df.columns] +... return df.set_axis(new_columns, axis="columns") + +>>> orders.skb.apply_func(with_upper_columns) + +Result: +――――――― + ITEM PRICE QTY +0 pen 1.5 1 +1 cup NaN 1 +2 pen 1.5 2 +3 fork 2.2 4 + +Unpacking multiple outputs from deferred functions +-------------------------------------------------- + +When a deferred function returns more than one value, you cannot unpack the +result directly because unpacking iterates over the result. Iteration is not +supported on DataOps until evaluation. + +In general, it is recommended that deferred functions return a single +value whenever possible. Returning multiple outputs should be avoided unless +strictly necessary, as it makes downstream usage more complex. + +Instead, keep the result as a single DataOp and index into it: + +>>> test = skrub.var("test", [1, 2]) +>>> @skrub.deferred +... def process_test_data(test): +... left = test[0] +... right = test[1] +... return left, right +>>> res = test.skb.apply_func(process_test_data) +>>> left = res[0] +>>> right = res[1] + +:func:`deferred` is useful not only for our own functions, but also when we +need to call module-level functions from a library. 
For example, to delay the +loading of a CSV file, we could write something like: + +>>> csv_path = skrub.var("csv_path") +>>> data = skrub.deferred(pd.read_csv)(csv_path) + +or, with ``apply_func``: + +>>> data = csv_path.skb.apply_func(pd.read_csv) + +Another consequence of the fact that DataOps are evaluated lazily (we are +building a pipeline, not immediately computing a single result) is that any +transformation that we apply must not modify its input, but leave it unchanged +and return a new value. + +Consider the transformers in a scikit-learn pipeline: each computes a new +result without modifying its input. + +>>> orders['total'] = orders['price'] * orders['qty'] +Traceback (most recent call last): + ... +TypeError: Do not modify a DataOp in-place. Instead, use a function that returns a new value. This is necessary to allow chaining several steps in a sequence of transformations. +For example if df is a pandas DataFrame: +df = df.assign(new_col=...) instead of df['new_col'] = ... + +Note the suggestion in the error message: using :meth:`pandas.DataFrame.assign`. +When we do need assignments or in-place transformations, we can put them in a +:func:`deferred` function. But we should make a (shallow) copy of the inputs and +return a new value. + +Finally, there are other situations where using :func:`deferred` can be helpful: + +- When we have many nodes in our graph and want to collapse a sequence of steps into + a single function call that appears as a single node. +- When certain function calls need to be deferred until the full computation + runs, because they depend on the runtime environment, or on objects that + cannot be pickled with the rest of the computation graph (for example, opening + and reading a file). + +.. rubric:: Examples + +- See :ref:`sphx_glr_auto_examples_data_ops_1110_data_ops_intro.py` for an introductory + example on how to use skrub DataOps on a single dataframe. 
+- See :ref:`sphx_glr_auto_examples_data_ops_1120_multiple_tables.py` for an example + of how skrub DataOps can be used to process multiple tables using dataframe APIs. +- See :ref:`sphx_glr_auto_examples_data_ops_1130_choices.py` for an example of + hyper-parameter tuning using skrub DataOps. diff --git a/skrub/_docs/modules/data_ops/basics/data_ops_vs_alternatives.rst b/skrub/_docs/modules/data_ops/basics/data_ops_vs_alternatives.rst new file mode 100644 index 000000000..bffd79b2d --- /dev/null +++ b/skrub/_docs/modules/data_ops/basics/data_ops_vs_alternatives.rst @@ -0,0 +1,67 @@ +.. currentmodule:: skrub + +.. _user_guide_data_ops_vs_alternatives: + +How do skrub Data Ops differ from the alternatives? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +Skrub DataOps and scikit-learn :class:`sklearn.pipeline.Pipeline` +==================================================================== + +Scikit-learn pipelines represent a linear sequence of transformations on one +table with a fixed number of rows. + +.. image:: ../../../_static/sklearn_pipeline.svg + :width: 500 + +Skrub DataOps, on the other hand, can manipulate any number of variables. +The transformation they perform is not a linear sequence but any Directed +Acyclic Graph of computations. Take the following example, where our task is to predict +item price in dollars: + +.. image:: ../../../_static/dataops_graph.svg + +- Here we use three input variables: two tables ("Items" and "Prices") and a + float ("euro_dollar_rate"). +- For this regression task, we have declared which intermediary step can be + considered as the design matrix X (shown in blue) and as the target y + (shown in orange). +- Akin to scikit-learn pipelines, we apply an estimator (Ridge) at the end of the + processing. + +The rest of this user guide will detail how the DataOps work. 
+ + +Skrub DataOps and orchestrators like Apache Airflow +=================================================================== + +Skrub pipelines are not an `orchestrator `_ +and do not offer capabilities for scheduling runs or provisioning resources and +environments. Instead, they are a generalization of scikit-learn pipelines, which +can still be used within an orchestrator. + +Skrub DataOps and other skrub objects, like :func:`~skrub.tabular_pipeline` +=============================================================================== + +Skrub DataOps are built to maximize flexibility in the construction of complex +pre-processing and machine learning pipelines. On the other hand, the main intent +of skrub objects such as :func:`~skrub.tabular_pipeline` and +:class:`~skrub.TableVectorizer` is to provide interfaces for common +pre-processing tasks, and simple and robust baselines for +machine learning. As a result, these objects are more opinionated and +less flexible than DataOps. + +However, it is possible to combine DataOps and regular skrub and scikit-learn +transformers to improve their flexibility, particularly in multi-table scenarios. + +Can I use library "x" with skrub DataOps? +========================================== + +Yes, skrub DataOps are designed to be "transparent", so that any method used by +the underlying data structures (e.g., Pandas or Polars) can be accessed directly: +check :ref:`user_guide_direct_access_ref` for more details. +All DataOps-specific operations are available through the ``.skb`` attribute, +which provides access to the DataOps namespace. Other library-specific methods +are available directly from the DataOp object, as if it were a regular object +(like a Pandas or Polars DataFrame or Series). 
diff --git a/skrub/_docs/modules/data_ops/basics/direct_access_methods.rst b/skrub/_docs/modules/data_ops/basics/direct_access_methods.rst new file mode 100644 index 000000000..a03116b3e --- /dev/null +++ b/skrub/_docs/modules/data_ops/basics/direct_access_methods.rst @@ -0,0 +1,81 @@ +.. currentmodule:: skrub +.. _user_guide_direct_access_ref: + +DataOps allow direct access to methods of the underlying data +============================================================= + +DataOps are designed to be flexible and allow direct access to the underlying data, +so that it is possible to use the APIs of the underlying data structures +(e.g., Pandas or Polars) directly: + +Suppose we want to process dataframes that look like this: + +>>> import pandas as pd +>>> orders_df = pd.DataFrame( +... { +... "item": ["pen", "cup", "pen", "fork"], +... "price": [1.5, None, 1.5, 2.2], +... "qty": [1, 1, 2, 4], +... } +... ) +>>> orders_df + item price qty +0 pen 1.5 1 +1 cup NaN 1 +2 pen 1.5 2 +3 fork 2.2 4 + +We can create a skrub variable to represent that input: + +>>> import skrub +>>> orders = skrub.var("orders", orders_df) + +Because we know that a dataframe will be provided as input to the computation, we +can manipulate ``orders`` as if it were a regular dataframe. + +We can access its attributes: + +>>> orders.columns + +Result: +――――――― +Index(['item', 'price', 'qty'], dtype=...) + +Accessing items, indexing, slicing: + +>>> orders["item"].iloc[1:] + +Result: +――――――― +1 cup +2 pen +3 fork +Name: item, dtype: ... + +We can apply operators: + +>>> orders["price"] * orders["qty"] + +Result: +――――――― +0 1.5 +1 NaN +2 3.0 +3 8.8 +dtype: float64 + +We can call methods: + +>>> orders.assign(total=orders["price"] * orders["qty"]) + +Result: +――――――― + item price qty total +0 pen 1.5 1 1.5 +1 cup NaN 1 NaN +2 pen 1.5 2 3.0 +3 fork 2.2 4 8.8 + +Note that the original ``orders`` variable is not modified by the operations +above. Instead, each operation creates a new DataOp. 
DataOps cannot be +modified in-place; all operations that we apply must produce a new value. diff --git a/skrub/_docs/modules/data_ops/basics/using_previews.rst b/skrub/_docs/modules/data_ops/basics/using_previews.rst new file mode 100644 index 000000000..c7919096e --- /dev/null +++ b/skrub/_docs/modules/data_ops/basics/using_previews.rst @@ -0,0 +1,96 @@ +.. currentmodule:: skrub +.. _user_guide_data_ops_using_previews: + +Using previews for easier development and debugging +=================================================== + +To make interactive development easier without having to call ``eval()`` after +each step, it is possible to preview the result of a DataOp by passing a value +along with its name when creating a variable. + +>>> import skrub +>>> a = skrub.var("a", 10) # we pass the value 10 in addition to the name +>>> b = skrub.var("b", 6) +>>> c = a + b +>>> c # now the display of c includes a preview of the result + +Result: +――――――― +16 + +Previews are eager computations on the current data, and since they are computed +immediately they can spot errors early on: + +>>> import pandas as pd +>>> df = pd.DataFrame({"col": [1, 2, 3]}) +>>> a = skrub.var("a", df) # we pass the DataFrame as a value + +Next, we use the pandas ``drop`` method and try to drop a column without +specifying the axis: + +>>> a.drop("col") # doctest: +IGNORE_EXCEPTION_DETAIL +ELLIPSIS +Traceback (most recent call last): + ... +RuntimeError: Evaluation of '.drop()' failed. +You can see the full traceback above. The error message was: +KeyError: "['col'] not found in axis" + +Note that seeing results for the values we provided does *not* change the fact +that we are building a pipeline that we want to reuse, not just computing the +result for a fixed input. The displayed result is only a preview of the output on +one example dataset. 
+ +>>> c.skb.eval({"a": 3, "b": 2}) +5 + +It is not necessary to provide a value for every variable: it is however advisable +to do so when possible, as it allows catching errors early on. + +Note: you can obtain the preview values with :meth:`DataOp.skb.get_data`, and +set different ones with :meth:`DataOp.skb.set_data`. + +Defining a default value for a variable +--------------------------------------- + +If we pass ``becomes_default=True`` to :func:`var`, the provided ``value`` is not +only an example value to use for previews but a default value for this variable +in all contexts -- then it is always optional to pass a value for it in the +environment, and if not found the default is used. + +>>> a = skrub.var('a', 0) +>>> a + +Result: +――――――― +0 +>>> b = skrub.var('b', 1, becomes_default=True) +>>> b + +Result (also the default value): +―――――――――――――――――――――――――――――――― +1 +>>> c = a + b +>>> c.skb.eval({'a': 10}) # the default 1 is used for 'b' +11 + +See the documentation of :func:`var` for details. + +Disabling previews and eager checks +----------------------------------- + +By default, as soon as a DataOp is defined, some validity checks are performed +and the preview results are computed eagerly. In very complex DataOps plans +(100+ nodes), running checks after adding each node can cause a noticeable overhead. +To avoid this, it is possible to disable eager checks with the +``"eager_data_ops"`` :ref:`configuration +` option. + + +>>> with skrub.config_context(eager_data_ops=False): +... # no checks are performed when b is defined so no error in the line below: +... b = skrub.var('a', 1) + skrub.var('a', 2) +... # checks are still performed (once) before the DataOp is actually used so +... # evaluating the DataOp, using .skb.make_learner() etc _would_ still raise: +... # b.skb.eval() ## raises ValueError: Choice and node names must be unique. 
+>>> b # Note there is no preview, even though we provided values for the variables + diff --git a/skrub/_docs/modules/data_ops/basics/what_are_data_ops.rst b/skrub/_docs/modules/data_ops/basics/what_are_data_ops.rst new file mode 100644 index 000000000..8392cc279 --- /dev/null +++ b/skrub/_docs/modules/data_ops/basics/what_are_data_ops.rst @@ -0,0 +1,34 @@ +.. currentmodule:: skrub + +.. _user_guide_data_ops_intro: + + +Basics of DataOps: the DataOps plan, variables, and learners +=============================================================== + +**DataOps** are special objects that encapsulate operations on data (such as +applying operators, or calling methods) to record the parameters so that they +can later be replayed on new data. DataOps objects can be combined into a +DataOps plan, which is a directed acyclic graph (DAG) of operations. + +DataOps have a ``.skb`` attribute that provides access to the DataOps namespace, +which contains methods for evaluating the DataOps plan, exporting the plan as a +**learner**, and various other utilities. Any other operation on a DataOp that is +not part of the DataOps namespace is instead applied to the underlying data: this +allows, for example, to make use of Pandas or Polars methods if the DataOp is +encapsulating a DataFrame or Series. + +The entry point of any DataOps plan is :class:`~skrub.var`, +a **variable**: a variable is an input to +our machine learning pipeline, such as a table of data, a target array, or more +generic data such as paths to files, or timestamps. + +Variables can be combined using operators and function calls to build more +complex DataOps plans. The plan is constructed implicitly as we apply these +operations, rather than by specifying an explicit list of transformations. + +At any point in the DataOps plan, we can export the resulting computation graph +as a **learner** with :meth:`~skrub.DataOp.skb.make_learner()`. 
A learner is a +special object akin to a scikit-learn estimator, but that takes as input a +dictionary of variables rather than a single design matrix ``X`` and a target array +``y``. diff --git a/skrub/_docs/modules/data_ops/ml_pipeline/applying_different_transformers.rst b/skrub/_docs/modules/data_ops/ml_pipeline/applying_different_transformers.rst new file mode 100644 index 000000000..54901bee1 --- /dev/null +++ b/skrub/_docs/modules/data_ops/ml_pipeline/applying_different_transformers.rst @@ -0,0 +1,153 @@ +.. currentmodule:: skrub +.. _user_guide_data_ops_applying_different_transformers: + +Applying different transformers using skrub selectors and DataOps +================================================================= + +It is possible to use skrub selectors to define which columns to apply +transformers to, and then apply different transformers to different subsets of +the data. + +For example, this can be useful to apply :class:`~skrub.TextEncoder` to columns +that contain free-flowing text, and :class:`~skrub.StringEncoder` to other string +columns that contain categorical data such as country names. + +Or, a string column may need to be encoded in an ordered way, like in the following +example with grades. + +>>> import skrub +>>> import pandas as pd +>>> data = { +... "subject": ["Math", "English", "History", "Science", "Art"], +... "grade": ["A", "B", "C", "A", "B"] +... } +>>> df = pd.DataFrame(data) +>>> grades = skrub.var("grades", df) +>>> grades + +Result: +――――――― + subject grade +0 Math A +1 English B +2 History C +3 Science A +4 Art B + +We encode the subjects with the :class:`~skrub.StringEncoder`: + +>>> from skrub import StringEncoder +>>> enc_subject = grades.skb.select(cols="subject").skb.apply(StringEncoder(n_components=2)) + +For the grades, we define a :func:`~skrub.deferred` function that maps the strings +to the order we want. 
+Remember that objects inside deferred functions are regular Python +objects (more detail in :ref:`user_guide_data_ops_control_flow`). + +>>> @skrub.deferred +... def encode_ordered(df): +... grade_order = {"A": 3, "B": 2, "C": 1} +... return df["grade"].map(grade_order) +>>> enc_grades = grades.skb.apply_func(encode_ordered) +>>> enc_grades + +Result: +――――――― +0 3 +1 2 +2 1 +3 3 +4 2 +Name: grade, dtype: int64 + +Finally, we combine the resulting dataframe and series using another deferred +function. + +>>> @skrub.deferred +... def combine(subjects, grades): +... subjects["grade"] = grades +... return subjects +>>> combine(enc_subject, enc_grades) # doctest: +SKIP + +Result: +――――――― + subject_0 subject_1 grade +0 1.800470e-07 1.704487e+00 3 +1 1.675736e-07 -1.998386e-08 2 +2 1.615310e+00 2.142048e-07 1 +3 -4.709333e-08 5.155605e-08 3 +4 -5.441046e-01 4.167525e-09 2 + + +In the next example, we apply a :class:`~skrub.StringEncoder` to columns +with high cardinality, a mathematical operation to columns with nulls, and a +:class:`~skrub.TableVectorizer` to all other columns. We use the skrub +:ref:`selectors ` to select the columns based on our requirements. + +>>> import pandas as pd +>>> import skrub +>>> orders_df = pd.DataFrame( +... { +... "item": ["pen", "cup", "pen", "fork"], +... "price": [1.5, None, 1.5, 2.2], +... "qty": [1, 1, 2, 4], +... } +... 
) +>>> orders = skrub.var("orders", orders_df) +>>> orders + +Result: +――――――― + item price qty +0 pen 1.5 1 +1 cup NaN 1 +2 pen 1.5 2 +3 fork 2.2 4 + +We create some selectors with different conditions: + +>>> from skrub import selectors as s +>>> high_cardinality = s.string() - s.cardinality_below(2) +>>> has_nulls = s.has_nulls() +>>> leftover = s.all() - high_cardinality - has_nulls + +>>> vectorizer = skrub.StringEncoder(n_components=2) +>>> vectorized_items = orders.skb.select(high_cardinality).skb.apply(vectorizer) +>>> vectorized_items # doctest: +SKIP + +Result: +――――――― + item_0 item_1 price qty +0 1.511858e+00 9.380015e-08 1.5 1 +1 -1.704687e-07 1.511858e+00 NaN 1 +2 1.511858e+00 9.380015e-08 1.5 2 +3 -5.458670e-09 -6.917769e-08 2.2 4 + +>>> vectorized_has_nulls = orders.skb.select(cols=has_nulls) * 11 +>>> vectorized_has_nulls + + Result: + ――――――― + price + 0 16.5 + 1 NaN + 2 16.5 + 3 24.2 +>>> everything_else = orders.skb.select(cols=leftover).skb.apply(skrub.TableVectorizer()) + +After encoding the columns, the resulting DataOps can be concatenated together +to obtain the final result: + +>>> encoded = ( +... everything_else.skb.concat([vectorized_items, vectorized_has_nulls], axis=1) +... ) +>>> encoded # doctest: +SKIP + qty item_0 item_1 price +0 1.0 1.594282e+00 -1.224524e-07 16.5 +1 1.0 9.228692e-08 1.473794e+00 NaN +2 2.0 1.594282e+00 -1.224524e-07 16.5 +3 4.0 7.643604e-09 6.080018e-01 24.2 + +More info on advanced column selection and manipulation can be found in +:ref:`user_guide_selectors` and example +:ref:`sphx_glr_auto_examples_0090_apply_to_cols.py`. diff --git a/skrub/_docs/modules/data_ops/ml_pipeline/applying_ml_estimators.rst b/skrub/_docs/modules/data_ops/ml_pipeline/applying_ml_estimators.rst new file mode 100644 index 000000000..f0bceb017 --- /dev/null +++ b/skrub/_docs/modules/data_ops/ml_pipeline/applying_ml_estimators.rst @@ -0,0 +1,66 @@ +.. currentmodule:: skrub +.. 
_user_guide_data_ops_applying_ml_estimators: + +Applying machine-learning estimators +===================================== + +In addition to working directly with the API provided by the underlying data, +DataOps can also be used to apply machine-learning estimators from +scikit-learn or skrub to the data. This is done through the +:func:`.skb.apply() ` method: + +>>> import pandas as pd +>>> import skrub +>>> orders_df = pd.DataFrame( +... { +... "item": ["pen", "cup", "pen", "fork"], +... "price": [1.5, None, 1.5, 2.2], +... "qty": [1, 1, 2, 4], +... } +... ) +>>> orders = skrub.var("orders", orders_df) + +>>> orders.skb.apply(skrub.TableVectorizer()) + +Result: +――――――― + item_cup item_fork item_pen price qty +0 0.0 0.0 1.0 1.5 1.0 +1 1.0 0.0 0.0 NaN 1.0 +2 0.0 0.0 1.0 1.5 2.0 +3 0.0 1.0 0.0 2.2 4.0 + +It is also possible to apply a transformer to a subset of the columns. The ``cols`` +parameter can also use a skrub :ref:`selector ` for finer +grained control. +Note that any column that is not selected is passed through unchanged, like below: + +>>> vectorized_orders = orders.skb.apply( +... skrub.StringEncoder(n_components=3), cols="item" +... 
) +>>> vectorized_orders # doctest: +SKIP + +Result: +――――――― + item_0 item_1 item_2 price qty +0 9.999999e-01 1.666000e-08 4.998001e-08 1.5 1 +1 -1.332800e-07 -1.199520e-07 1.000000e+00 NaN 1 +2 9.999999e-01 1.666000e-08 4.998001e-08 1.5 2 +3 3.942477e-08 9.999999e-01 7.884953e-08 2.2 4 + +Then, we can export the transformation as a learner with +:meth:`.skb.make_learner() ` + +>>> learner = vectorized_orders.skb.make_learner(fitted=True) +>>> new_orders = pd.DataFrame({"item": ["fork"], "price": [2.2], "qty": [5]}) +>>> learner.transform({"orders": new_orders}) # doctest: +SKIP + item_0 item_1 item_2 price qty +0 5.984116e-09 1.0 -1.323546e-07 2.2 5 + +Note that here the learner is **fitted** on the preview data, but in general it can +be exported without fitting, and then fitted on new data provided as an environment +dictionary. By default, the learner is returned without fitting. + +>>> learner = vectorized_orders.skb.make_learner() +>>> learner.fit({"orders": orders_df}) +SkrubLearner(data_op=) diff --git a/skrub/_docs/modules/data_ops/ml_pipeline/documenting_data_ops_plan.rst b/skrub/_docs/modules/data_ops/ml_pipeline/documenting_data_ops_plan.rst new file mode 100644 index 000000000..cb05d1010 --- /dev/null +++ b/skrub/_docs/modules/data_ops/ml_pipeline/documenting_data_ops_plan.rst @@ -0,0 +1,40 @@ +.. currentmodule:: skrub +.. _user_guide_documenting_data_ops_plan: + +Documenting the DataOps plan with node names and descriptions +============================================================= + +We can improve the readability of the DataOps plan by giving names and descriptions +to the nodes in the plan. This is done with :meth:`.skb.set_name() ` +and :meth:`.skb.set_description() `. 
+ +>>> import skrub +>>> a = skrub.var('a', 1) +>>> b = skrub.var('b', 2) +>>> c = (a + b).skb.set_description('the addition of a and b') +>>> c.skb.description +'the addition of a and b' +>>> d = c.skb.set_name('d') +>>> d.skb.name +'d' + +Both names and descriptions can be used to mark relevant parts of the learner, and +they can be accessed from the computational graph and the plan report. + +Additionally, names can be used to bypass the computation of a node and override its +result by passing it as a key in the ``environment`` dictionary. + +>>> e = d * 10 +>>> e + +Result: +――――――― +30 +>>> e.skb.eval() +30 +>>> e.skb.eval({'a': 10, 'b': 5}) +150 +>>> e.skb.eval({'d': -1}) # -1 * 10 +-10 + +More info can be found in section :ref:`user_guide_data_ops_truncating_dataplan`. diff --git a/skrub/_docs/modules/data_ops/ml_pipeline/evaluating_debugging_data_ops.rst b/skrub/_docs/modules/data_ops/ml_pipeline/evaluating_debugging_data_ops.rst new file mode 100644 index 000000000..60440ffcd --- /dev/null +++ b/skrub/_docs/modules/data_ops/ml_pipeline/evaluating_debugging_data_ops.rst @@ -0,0 +1,32 @@ +.. currentmodule:: skrub +.. _user_guide_data_ops_evaluating_debugging_dataops: + +Evaluating and debugging the DataOps plan with :meth:`.skb.full_report() ` +================================================================================================== + +All operations on DataOps are recorded in a computational graph, which can be +inspected with :meth:`.skb.full_report() `. This method +generates an HTML report that shows the full plan, including all nodes, their names, +descriptions, and the transformations applied to the data. It is possible to give a +title to the evaluation report this way: +``my_data_op.skb.full_report(title="my title")``. + +An example of the report can be found +`here <../../../_static/credit_fraud_report/index.html>`_. + +For each node in the plan, the report shows: + +- The name and the description of the node, if present. 
+- Predecessor and successor nodes in the computational graph. +- Where the code relative to the node is defined. +- The estimator fitted in the node along with its parameters (if applicable). +- The preview of the data at that node. + +Additionally, if computations fail in the plan, the report shows the offending +node and the error message, which can help in debugging the plan. + +By default, reports are saved in the ``skrub_data/execution_reports`` directory, but +they can be saved to a different location with the ``output_dir`` parameter. +Note that the default path can be altered with the +``SKRUB_DATA_DIR`` environment variable. See :ref:`user_guide_configuration_parameters` +for more details. diff --git a/skrub/_docs/modules/data_ops/ml_pipeline/subsampling_data.rst b/skrub/_docs/modules/data_ops/ml_pipeline/subsampling_data.rst new file mode 100644 index 000000000..51a045feb --- /dev/null +++ b/skrub/_docs/modules/data_ops/ml_pipeline/subsampling_data.rst @@ -0,0 +1,29 @@ +.. currentmodule:: skrub +.. _user_guide_data_ops_subsampling: + +Subsampling data for easier development and debugging +===================================================== + +If the data used for the preview is large, it can be useful to work on a +subsample of the data to speed up the development and debugging process. +This can be done by calling the :meth:`.skb.subsample() ` method +on a variable: this signals to skrub that what is shown when printing DataOps, or +returned by :meth:`.skb.preview() ` is computed on a subsample +of the data. + +Note that subsampling is "local": if it is applied to a variable, it only +affects the variable itself. This may lead to unexpected results and errors +if, for example, ``X`` is subsampled but ``y`` is not. 
+ +Subsampling **is turned off** by default when we call other methods such as +:meth:`.skb.eval() `, +:meth:`.skb.cross_validate() `, +:meth:`.skb.train_test_split `, +:meth:`DataOp.skb.make_learner`, +:meth:`DataOp.skb.make_randomized_search`, etc. +However, all of those methods have a ``keep_subsampling`` parameter that we can +set to ``True`` to force using the subsampling when we call them. Note that +even if we set ``keep_subsampling=True``, subsampling is not applied when using +``predict``. + +See more details in a :ref:`full example `. diff --git a/skrub/_docs/modules/data_ops/ml_pipeline/using_part_of_data_ops_plan.rst b/skrub/_docs/modules/data_ops/ml_pipeline/using_part_of_data_ops_plan.rst new file mode 100644 index 000000000..83f5edd82 --- /dev/null +++ b/skrub/_docs/modules/data_ops/ml_pipeline/using_part_of_data_ops_plan.rst @@ -0,0 +1,80 @@ +.. currentmodule:: skrub +.. _user_guide_data_ops_truncating_dataplan: + +Using only a part of a DataOps plan +=================================== + +Besides documenting a DataOps plan, the :meth:`.skb.set_name() ` +has additional functions. By setting a name, we can: + +- Bypass the computation of that node and override its result by passing it as a + key in the ``environment`` argument. +- Truncate the computational graph after this node to obtain the intermediate result with + :meth:`SkrubLearner.truncated_after`. +- Retrieve that node and inspect the estimator that was fitted in it, if the + node was created with :meth:`.skb.apply() `. + +Here is a toy example with 4 steps: + +>>> def load_data(url): +... print("load: ", url) +... return [1, 2, 3, 4] + + +>>> def transform(x): +... print("transform") +... return [item * 10 for item in x] + + +>>> def agg(x): +... print("agg") +... return max(x) + + +>>> import skrub +>>> url = skrub.var("url") +>>> output = ( +... url.skb.apply_func(load_data) +... .skb.set_name("loaded") +... .skb.apply_func(transform) +... .skb.set_name("transformed") +... 
.skb.apply_func(agg) +... ) + +Above, we give a name to each intermediate result with ``.skb.set_name()`` so +that we can later refer to it when manipulating a fitted learner. + +>>> learner = output.skb.make_learner() +>>> learner.fit({"url": "file:///example.db"}) +load: file:///example.db +transform +agg +SkrubLearner(data_op=) + +>>> learner.transform({"url": "file:///example.db"}) +load: file:///example.db +transform +agg +40 + +Below, we bypass the data loading. Because we directly provide a value for the +intermediate result that we named ``"loaded"``, the corresponding computation is +skipped and the provided value is used instead. We can see that +``"load: ..."`` is not printed and that the rest of the computation proceeds +using ``[6, 5, 4]`` (instead of ``[1, 2, 3, 4]`` as before). + +>>> learner.transform({"loaded": [6, 5, 4]}) +transform +agg +60 + +Now we show how to stop at the result we named ``"transformed"``. With +``truncated_after``, we obtain a learner that computes that intermediate result +and returns it instead of applying the last transformation; note that ``"agg"`` +is not printed and we get the output of ``transform()``, not of ``agg()``: + +>>> truncated = learner.truncated_after("transformed") +>>> truncated.transform({"url": "file:///example.db"}) +load: file:///example.db +transform +[10, 20, 30, 40] diff --git a/skrub/_docs/modules/data_ops/validation/exporting_data_ops.rst b/skrub/_docs/modules/data_ops/validation/exporting_data_ops.rst new file mode 100644 index 000000000..2e462c4e5 --- /dev/null +++ b/skrub/_docs/modules/data_ops/validation/exporting_data_ops.rst @@ -0,0 +1,66 @@ +.. currentmodule:: skrub +.. _user_guide_data_ops_exporting: + +Exporting the DataOps plan as a learner and reusing it +======================================================== + +DataOps are designed to build complex pipelines that can be reused on new, unseen +data in potentially different environments from where they were created. 
This can +be achieved by exporting the DataOps plan as a **learner**: the learner is a special +transformer that is similar to a scikit-learn estimator, but that takes as input +the **environment** that should be used to execute the operations. The environment +is a dictionary of variables rather than a single design matrix +``X`` and a target array ``y``. + +>>> import pandas as pd +>>> orders_df = pd.DataFrame( +... { +... "item": ["pen", "cup", "pen", "fork"], +... "price": [1.5, None, 1.5, 2.2], +... "qty": [1, 1, 2, 4], +... } +... ) +>>> import skrub +>>> from skrub import TableVectorizer +>>> orders = skrub.var("orders", orders_df) +>>> transformed_orders = orders.skb.apply(TableVectorizer()) +>>> learner = transformed_orders.skb.make_learner() + +The learner can be fitted as it is exported by setting ``fitted=True`` when +creating it with :meth:`.skb.make_learner() <DataOp.skb.make_learner>`. +This will fit the learner on the data used for previews when the variables are defined +(``orders_df`` in the case above): + +>>> learner = transformed_orders.skb.make_learner(fitted=True) + +Alternatively, the learner can be fitted on a different dataset by passing +the data to the ``fit()`` method: + +>>> new_orders_df = pd.DataFrame( +... { +... "item": ["pen", "cup", "spoon"], +... "price": [1.5, 2.0, 1.0 ], +... "qty": [1, 2, 3], +... } +... ) +>>> learner.fit({"orders": new_orders_df}) +SkrubLearner(data_op=) + + +The learner can be fitted and applied to new data +using the same methods as a scikit-learn estimator, such as ``fit()``, +``fit_transform()``, and ``predict()``. + +The learner can be pickled and saved to disk, so that it can be reused later +or in a different environment: + +>>> import pickle +>>> with open("learner.pkl", "wb") as f: +... pickle.dump(learner, f) +>>> with open("learner.pkl", "rb") as f: +... 
loaded_learner = pickle.load(f) +>>> loaded_learner.fit({"orders": new_orders_df}) +SkrubLearner(data_op=) + +See :ref:`sphx_glr_auto_examples_data_ops_1150_use_case.py` for an example of how +to use the learner in a microservice. diff --git a/skrub/_docs/modules/data_ops/validation/hyperparameter_tuning.rst b/skrub/_docs/modules/data_ops/validation/hyperparameter_tuning.rst new file mode 100644 index 000000000..0e2110c81 --- /dev/null +++ b/skrub/_docs/modules/data_ops/validation/hyperparameter_tuning.rst @@ -0,0 +1,227 @@ +.. currentmodule:: skrub +.. _user_guide_data_ops_hyperparameter_tuning: + +Using the skrub ``choose_*`` functions to tune hyperparameters +============================================================== + +skrub provides a convenient way to declare ranges of possible values, and tune +those choices to keep the values that give the best predictions on a validation +set. + +Rather than specifying a grid of hyperparameters separately from the pipeline, +we simply insert special skrub objects in place of the value. + +We define the same set of operations as before: + +>>> from sklearn.datasets import load_diabetes +>>> from sklearn.linear_model import Ridge +>>> import skrub +>>> diabetes_df = load_diabetes(as_frame=True)["frame"] +>>> data = skrub.var("data", diabetes_df) +>>> X = data.drop(columns="target", errors="ignore").skb.mark_as_X() +>>> y = data["target"].skb.mark_as_y() +>>> pred = X.skb.apply(Ridge(), y=y) + +Now, we can +replace the hyperparameter ``alpha`` (which should be a float) with a range +created by :func:`skrub.choose_float`. skrub can use it to select the best value +for ``alpha``. + + + +>>> pred = X.skb.apply( +... Ridge(alpha=skrub.choose_float(1e-6, 10.0, log=True, name="α")), y=y +... ) + +.. warning:: + + When we do :meth:`.skb.make_learner() `, the + pipeline we obtain does not perform any hyperparameter tuning. The pipeline + we obtain by default uses default values for each of the choices. 
For numeric + choices it is the middle of the range (unless an explicit default has been + set when creating the choice), and for :func:`choose_from` it is the first + option we give it. We can also obtain random choices, or choices suggested by + an Optuna :class:`trial `, by passing the ``choose`` + parameter. + + To get a pipeline that runs an internal cross-validation to select the best + hyperparameters, we must use :meth:`.skb.make_grid_search() + ` or :meth:`.skb.make_randomized_search() + `. We can also use `Optuna + `_ to choose the best hyperparameters as shown + in :ref:`this example `. + + +Here are the different kinds of choices, along with their default outcome when +we are not using hyperparameter search: + +.. _choice-defaults-table: + +.. list-table:: Default choice outcomes + :header-rows: 1 + + * - Choosing function + - Description + - Default outcome + * - :func:`choose_from([10, 20]) ` + - Choose between the listed options (10 and 20). + - First outcome in the list: ``10`` + * - :func:`choose_from({"a_name": 10, "b_name": 20}) ` + - Choose between the listed options (10 and 20). Dictionary keys serve as + names for the options. + - First outcome in the dictionary: ``10`` + * - :func:`optional(10) ` + - Choose between the provided value and ``None`` (useful for optional + transformations in a pipeline, e.g., ``optional(StandardScaler())``). + - The provided ``value``: ``10`` + * - :func:`choose_bool() ` + - Choose between True and False. + - ``True`` + * - :func:`choose_float(1.0, 100.0) ` + - Sample a floating-point number in a range. + - The middle of the range: ``50.5`` + * - :func:`choose_int(1, 100) ` + - Sample an integer in a range. + - The integer closest to the middle of the range: ``50`` + * - :func:`choose_float(1.0, 100.0, log=True) ` + - Sample a float in a range on a logarithmic scale. 
+ - The middle of the range on a log scale: ``10.0`` + * - :func:`choose_int(1, 100, log=True) ` + - Sample an integer in a range on a logarithmic scale. + - The integer closest to the middle of the range on a log scale: ``10`` + * - :func:`choose_float(1.0, 100.0, n_steps=4) ` + - Sample a float on a grid. + - The step closest to the middle of the range: ``34.0`` (steps: ``[1.0, 34.0, 67.0, 100.0]``) + * - :func:`choose_int(1, 100, n_steps=4) ` + - Sample an integer on a grid. + - The step closest to the middle of the range: ``34`` (steps: ``[1, 34, 67, 100]``) + * - :func:`choose_float(1.0, 100.0, log=True, n_steps=4) ` + - Sample a float on a logarithmically spaced grid. + - The step closest to the middle of the range on a log scale: ``4.64`` + (steps: ``[1.0, 4.64, 21.54, 100.0]``) + * - :func:`choose_int(1, 100, log=True, n_steps=4) ` + - Sample an integer on a logarithmically spaced grid. + - The step closest to the middle of the range on a log scale: ``5`` + (steps: ``[1, 5, 22, 100]``) + + +The default choices for a DataOp, those that get used when calling +:meth:`.skb.make_learner() `, can be inspected with +:meth:`.skb.describe_defaults() `: + +>>> pred.skb.describe_defaults() +{'α': 0.00316...} + +We can then find the best hyperparameters. + +>>> search = pred.skb.make_randomized_search(fitted=True) +>>> search.results_ # doctest: +SKIP + α mean_test_score +0 0.000480 0.482327 +1 0.000287 0.482327 +2 0.000014 0.482317 +3 0.000012 0.482317 +4 0.000006 0.482317 +5 0.134157 0.478651 +6 0.249613 0.472019 +7 0.612327 0.442312 +8 2.664713 0.308492 +9 3.457901 0.275007 + +A human-readable description of parameters for a pipeline can be obtained with +:meth:`SkrubLearner.describe_params`: + +>>> search.best_learner_.describe_params() # doctest: +SKIP +{'α': 0.000479...} + +It is also possible to use :meth:`ParamSearch.plot_results` to visualize the results +of the search using a parallel coordinates plot. 
+ +This could also be done with Optuna, either by passing ``backend='optuna'`` to +:meth:`DataOp.skb.make_randomized_search`, or by using Optuna directly: + +>>> import optuna # doctest: +SKIP +>>> def objective(trial): # doctest: +SKIP +... learner = pred.skb.make_learner(choose=trial) +... cv_results = skrub.cross_validate(learner, pred.skb.get_data()) +... return cv_results['test_score'].mean() +>>> study = optuna.create_study(direction="maximize") # doctest: +SKIP +>>> study.optimize(objective, n_trials=10) # doctest: +SKIP +>>> best_learner = pred.skb.make_learner(choose=study.best_trial) # doctest: +SKIP +>>> best_learner.describe_params() # doctest: +SKIP +{'α': 0.0006391165935023005} + + +Rather than fitting a randomized or grid search to find the best combination, it +is also possible to obtain an iterator over different parameter combinations to +inspect their outputs or to have manual control over the model selection. This can +be done with :meth:`.skb.iter_learners_grid() ` or +:meth:`.skb.iter_learners_randomized() ` ( +which yield the candidate pipelines that are explored by the grid and randomized +search respectively), or with the ``choose`` parameter of +:meth:`.skb.make_learner() `. + +A full example of how to use hyperparameter search is available in +:ref:`sphx_glr_auto_examples_data_ops_1130_choices.py`, and a full example using +Optuna is in :ref:`example_optuna_choices`. + +| + + +.. _user_guide_data_ops_feature_selection: + +Feature selection with skrub :class:`SelectCols` and :class:`DropCols` +======================================================================= +It is possible to combine :class:`SelectCols` and :class:`DropCols` with +:func:`choose_from` to perform feature selection by dropping specific columns +and evaluating how this affects the downstream performance. + +Consider this example. 
We first define the variable: + +>>> import pandas as pd +>>> import skrub.selectors as s +>>> from sklearn.preprocessing import StandardScaler, OneHotEncoder +>>> df = pd.DataFrame({"text": ["foo", "bar", "baz"], "number": [1, 2, 3]}) +>>> X = skrub.X(df) + +Then, we use the :ref:`skrub selectors <user_guide_selectors>` to encode each +column with a different transformer: + +>>> X_enc = X.skb.apply(StandardScaler(), cols=s.numeric()).skb.apply( +... OneHotEncoder(sparse_output=False), cols=s.string() +... ) +>>> X_enc + +Result: +――――――― + number text_bar text_baz text_foo +0 -1.224745 0.0 0.0 1.0 +1 0.000000 1.0 0.0 0.0 +2 1.224745 0.0 1.0 0.0 + +Now we can use :class:`skrub.DropCols` to define two possible selection strategies: +the first drops the column ``number``, the second drops all columns that start with +``text``. We rely again on the skrub selectors for this: + +>>> from skrub import DropCols +>>> drop = DropCols(cols=skrub.choose_from( +... {"number": s.cols("number"), "text": s.glob("text_*")}) +... ) +>>> X_enc.skb.apply(drop) + +Result: +――――――― + text_bar text_baz text_foo +0 0.0 0.0 1.0 +1 1.0 0.0 0.0 +2 0.0 1.0 0.0 + +We can see the generated parameter grid with :meth:`DataOp.skb.describe_param_grid`. + +>>> X_enc.skb.apply(drop).skb.describe_param_grid() +"- choose_from({'number': …, 'text': …}): ['number', 'text']\n" + +A more advanced application of this technique is used in +`this tutorial on forecasting timeseries `_, +along with the feature engineering required to prepare the columns, and the +analysis of the results. diff --git a/skrub/_docs/modules/data_ops/validation/nested_cross_validation.rst b/skrub/_docs/modules/data_ops/validation/nested_cross_validation.rst new file mode 100644 index 000000000..b9a5d81d0 --- /dev/null +++ b/skrub/_docs/modules/data_ops/validation/nested_cross_validation.rst @@ -0,0 +1,45 @@ +.. currentmodule:: skrub +.. 
_user_guide_data_ops_nested_cross_validation: + +Validating hyperparameter search with nested cross-validation +============================================================= + +To avoid overfitting hyperparameters, the best combination must be evaluated on +data that has not been used to select hyperparameters. This can be done with a +single train-test split or with nested cross-validation. + +Using the same examples as the previous sections: + +>>> from sklearn.datasets import load_diabetes +>>> from sklearn.linear_model import Ridge +>>> import skrub +>>> diabetes_df = load_diabetes(as_frame=True)["frame"] +>>> data = skrub.var("data", diabetes_df) +>>> X = data.drop(columns="target", errors="ignore").skb.mark_as_X() +>>> y = data["target"].skb.mark_as_y() +>>> pred = X.skb.apply( +... Ridge(alpha=skrub.choose_float(0.01, 10.0, log=True, name="α")), y=y +... ) + +Single train-test split: + +>>> split = pred.skb.train_test_split() +>>> search = pred.skb.make_randomized_search() +>>> search.fit(split['train']) +ParamSearch(data_op=, + search=RandomizedSearchCV(estimator=None, param_distributions=None)) +>>> search.score(split['test']) # doctest: +SKIP +0.4922874902029253 + +For nested cross-validation we use :func:`skrub.cross_validate`, which accepts a +``pipeline`` parameter (as opposed to +:meth:`.skb.cross_validate() ` +which always uses the default hyperparameters): + +>>> skrub.cross_validate(pred.skb.make_randomized_search(), pred.skb.get_data()) # doctest: +SKIP + fit_time score_time test_score +0 0.891390 0.002768 0.412935 +1 0.889267 0.002773 0.519140 +2 0.928562 0.003124 0.491722 +3 0.890453 0.002732 0.428337 +4 0.889162 0.002773 0.536168 diff --git a/skrub/_docs/modules/data_ops/validation/nesting_choices_choosing_pipelines.rst b/skrub/_docs/modules/data_ops/validation/nesting_choices_choosing_pipelines.rst new file mode 100644 index 000000000..d19f7a271 --- /dev/null +++ b/skrub/_docs/modules/data_ops/validation/nesting_choices_choosing_pipelines.rst 
@@ -0,0 +1,110 @@ +.. currentmodule:: skrub +.. _user_guide_data_ops_nesting_choices: + +Going beyond estimator hyperparameters: nesting choices and choosing pipelines +------------------------------------------------------------------------------ + +Choices are not limited to scikit-learn hyperparameters: we can use choices +wherever we use DataOps. The choice of the estimator to use, any argument of +a DataOp's method or :func:`deferred` function call, etc. can be replaced +with choices. We can also choose between several DataOps to compare +different pipelines. + +As an example of choices outside of scikit-learn estimators, we can consider +several ways to perform an aggregation on a pandas DataFrame: + +>>> import skrub +>>> ratings = skrub.var("ratings") +>>> agg_ratings = ratings.groupby("movieId")["rating"].agg( +... skrub.choose_from(["median", "mean"], name="rating_aggregation") +... ) +>>> print(agg_ratings.skb.describe_param_grid()) +- rating_aggregation: ['median', 'mean'] + +We can also choose between several completely different pipelines by turning a +choice into a DataOp, via its ``as_data_op`` method (or by using +:func:`as_data_op` on any object). + +>>> from sklearn.preprocessing import StandardScaler +>>> from sklearn.ensemble import RandomForestRegressor +>>> from sklearn.datasets import load_diabetes +>>> from sklearn.linear_model import Ridge +>>> import skrub +>>> diabetes_df = load_diabetes(as_frame=True)["frame"] +>>> data = skrub.var("data", diabetes_df) +>>> X = data.drop(columns="target", errors="ignore").skb.mark_as_X() +>>> y = data["target"].skb.mark_as_y() + +>>> ridge_pred = X.skb.apply(skrub.optional(StandardScaler())).skb.apply( +... Ridge(alpha=skrub.choose_float(0.01, 10.0, log=True, name="α")), y=y +... ) +>>> rf_pred = X.skb.apply( +... RandomForestRegressor(n_estimators=skrub.choose_int(5, 50, name="N 🌴")), y=y +... 
) +>>> pred = skrub.choose_from({"ridge": ridge_pred, "rf": rf_pred}).as_data_op() +>>> print(pred.skb.describe_param_grid()) +- choose_from({'ridge': …, 'rf': …}): 'ridge' + optional(StandardScaler()): [StandardScaler(), None] + α: choose_float(0.01, 10.0, log=True, name='α') +- choose_from({'ridge': …, 'rf': …}): 'rf' + N 🌴: choose_int(5, 50, name='N 🌴') + +Also note that as seen above, choices can be nested arbitrarily. For example it +is frequent to choose between several estimators, each of which contains choices +in its hyperparameters. + +| + + +Linking choices depending on other choices +------------------------------------------ + +Choices can depend on another choice made with :func:`choose_from`, +:func:`choose_bool` or :func:`optional` through those objects' ``.match()`` +method. + +Suppose we want to use either ridge regression, random forest or gradient +boosting, and that we want to use imputation for ridge and random forest (only), +and scaling for the ridge (only). We can start by choosing the kind of +estimators and make further choices depend on the estimator kind: + +>>> import skrub +>>> from sklearn.impute import SimpleImputer, KNNImputer +>>> from sklearn.preprocessing import StandardScaler, RobustScaler +>>> from sklearn.linear_model import Ridge +>>> from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor + +>>> estimator_kind = skrub.choose_from( +... ["ridge", "random forest", "gradient boosting"], name="estimator" +... ) +>>> imputer = estimator_kind.match( +... {"gradient boosting": None}, +... default=skrub.choose_from([SimpleImputer(), KNNImputer()], name="imputer"), +... ) +>>> scaler = estimator_kind.match( +... {"ridge": skrub.choose_from([StandardScaler(), RobustScaler()], name="scaler")}, +... default=None, +... ) +>>> predictor = estimator_kind.match( +... { +... "ridge": Ridge(), +... "random forest": RandomForestRegressor(), +... "gradient boosting": HistGradientBoostingRegressor(), +... } +... 
) +>>> pred = skrub.X().skb.apply(imputer).skb.apply(scaler).skb.apply(predictor) +>>> print(pred.skb.describe_param_grid()) +- estimator: 'ridge' + imputer: [SimpleImputer(), KNNImputer()] + scaler: [StandardScaler(), RobustScaler()] +- estimator: 'random forest' + imputer: [SimpleImputer(), KNNImputer()] +- estimator: 'gradient boosting' + +Note that only relevant choices are included in each subgrid. For example, when +the estimator is ``'random forest'``, the subgrid contains several options for +imputation but not for scaling. + +In addition to ``match``, choices created with :func:`choose_bool` have an +``if_else()`` method which is a convenience helper equivalent to +``match({True: ..., False: ...})``. diff --git a/skrub/_docs/modules/data_ops/validation/tuning_validating_data_ops.rst b/skrub/_docs/modules/data_ops/validation/tuning_validating_data_ops.rst new file mode 100644 index 000000000..9cbd371cb --- /dev/null +++ b/skrub/_docs/modules/data_ops/validation/tuning_validating_data_ops.rst @@ -0,0 +1,266 @@ +.. currentmodule:: skrub +.. _user_guide_data_ops_tuning_validating_dataops: + +Tuning and validating skrub DataOps plans +========================================= + +To evaluate the prediction performance of our plan, we can fit it on a training +dataset, then obtain predictions on an unseen test dataset. + +In scikit-learn, we pass to estimators and pipelines an ``X`` and ``y`` matrix +with one row per observation from the start. Therefore, we can split the +data into a training and test set independently from the pipeline. + +However, in many real-world scenarios, our data sources are not already +organized into ``X`` and ``y`` matrices. Some transformations may be necessary to +build them, and we want to keep those transformations inside the pipeline so +that they can be reliably re-applied to new data. 
+ +Therefore, we must start our pipeline by creating the design matrix and targets, +then tell skrub which intermediate results in the pipeline constitute ``X`` and +``y`` respectively. + +Let us consider a toy example where we simply obtain ``X`` and +``y`` from a single table. More complex transformations would be handled in +the same way. + +>>> from sklearn.datasets import load_diabetes +>>> from sklearn.linear_model import Ridge +>>> import skrub + +>>> diabetes_df = load_diabetes(as_frame=True)["frame"] + +In the original data, all features and the target are in the same dataframe. + +>>> data = skrub.var("data", diabetes_df) + +We build our design matrix by dropping the target. Note we use +``errors="ignore"`` so that pandas does not raise an error if the column we want +to drop is already missing. Indeed, when we will need to make actual useful +predictions on unlabelled data, the "target" column will not be available. + +>>> X = data.drop(columns="target", errors="ignore").skb.mark_as_X() + +We use :meth:`.skb.mark_as_X() ` to indicate that this +intermediate result (the dataframe obtained after dropping "target") is the +``X`` design matrix. This is the dataframe that will be split into a training +and a testing part when we split our dataset or perform cross-validation. + +Similarly for ``y``, we use :meth:`.skb.mark_as_y() `: + +>>> y = data["target"].skb.mark_as_y() + +Now we can add our supervised estimator: + +>>> pred = X.skb.apply(Ridge(), y=y) +>>> pred # doctest: +SKIP + +Result: +――――――― + target +0 182.673354 +1 90.998607 +2 166.113476 +3 156.034880 +4 133.659575 +.. ... +437 180.323365 +438 135.798908 +439 139.855630 +440 182.645829 +441 83.564413 +[442 rows x 1 columns] + + +Once a pipeline is defined and the ``X`` and ``y`` nodes are identified, skrub +is able to split the dataset and perform cross-validation. 
+ +Improving the confidence in our score through cross-validation +============================================================== + +We can increase our confidence in our score by using cross-validation instead of +a single split. The same mechanism is used but we now fit and evaluate the model +on several splits. This is done with :meth:`.skb.cross_validate() +`. + +>>> pred.skb.cross_validate() # doctest: +SKIP + fit_time score_time test_score +0 0.002816 0.001344 0.321665 +1 0.002685 0.001323 0.440485 +2 0.002468 0.001308 0.422104 +3 0.002748 0.001321 0.424661 +4 0.002649 0.001309 0.441961 + +.. _user_guide_data_ops_splitting_data: + +Splitting the data in train and test sets +========================================= + +We can use :meth:`.skb.train_test_split() ` to +perform a single train-test split. skrub first evaluates the DataOps on +which we used :meth:`.skb.mark_as_X() ` and +:meth:`.skb.mark_as_y() `: the first few steps of the +pipeline are executed until we have a value for ``X`` and for ``y``. +Then, those +dataframes are split using the provided split function (by default +scikit-learn's :func:`sklearn.model_selection.train_test_split`). + +>>> split = pred.skb.train_test_split(shuffle=False) +>>> split.keys() +dict_keys(['train', 'test', 'X_train', 'X_test', 'y_train', 'y_test']) + +``train`` and ``test`` are the full dictionaries corresponding to the training +and testing data. The corresponding ``X`` and ``y`` are the values, in those +dictionaries, for the nodes marked with +:meth:`.skb.mark_as_X() ` +and :meth:`.skb.mark_as_y() `. + +We can now fit our pipeline on the training data: + +>>> learner = pred.skb.make_learner() +>>> learner.fit(split["train"]) +SkrubLearner(data_op=) + +Only the training part of ``X`` and ``y`` are used. The subsequent steps are +evaluated, using this data, to fit the rest of the pipeline. 
+ +And we can obtain predictions on the test part: + +>>> test_pred = learner.predict(split["test"]) +>>> test_y_true = split["y_test"] + +>>> from sklearn.metrics import r2_score + +>>> r2_score(test_y_true, test_pred) # doctest: +SKIP +0.440999149220359 + +It is possible to define a custom split function to use instead of +:func:`sklearn.model_selection.train_test_split`. + +Passing additional arguments to the splitter +============================================ + +Sometimes we want to pass additional data to the cross-validation splitter. + +For example, if there is a group structure in our data (such as sites, +hospitals, etc.) and we want the model to generalize to unseen groups, we must +ensure while evaluating it that each group goes entirely in the train set or the +test set, but is not divided between the two. This can be done with +:class:`sklearn.model_selection.GroupKFold`, +:class:`sklearn.model_selection.LeavePGroupsOut`, etc. The ``split`` function +of those objects accepts a ``groups`` parameter. We can compute the groups +inside of the DataOp and pass them to :meth:`DataOp.skb.mark_as_X` and they will +be passed to the splitter. + +>>> df = skrub.datasets.toy_products() +>>> df + description price seller category +0 screen 100 supermarket.com electronics +1 hammer 15 bestproducts.com tools +2 keyboard 20 supermarket.com electronics +3 usb key 9 bestproducts.com electronics +4 charger 13 bestproducts.com electronics +5 screwdriver 12 supermarket.com tools + +Suppose we want to assess generalization to new sellers. While splitting for +cross-validation we must group products by seller. We do it with +:class:`sklearn.model_selection.LeaveOneGroupOut`. + +>>> from sklearn.dummy import DummyClassifier +>>> from sklearn.model_selection import LeaveOneGroupOut + +>>> data = skrub.var("df", df) +>>> groups = data["seller"] +>>> X = data[["description", "price"]].skb.mark_as_X( +... cv=LeaveOneGroupOut(), split_kwargs={"groups": groups} +...
) +>>> y = data["category"].skb.mark_as_y() +>>> pred = X.skb.apply(DummyClassifier(), y=y) +>>> split = pred.skb.train_test_split() + +The train set only contains data from the "supermarket.com" seller. + +>>> split["X_train"] + description price +0 screen 100 +2 keyboard 20 +5 screwdriver 12 + +The test set only contains data from the "bestproducts.com" seller. + +>>> split["X_test"] + description price +1 hammer 15 +3 usb key 9 +4 charger 13 + +Passing additional arguments to the scorer +========================================== + +Sometimes we have additional information to pass to the scorer such as sample +weights, group information etc. + +We can control how scoring is performed by using +:meth:`DataOp.skb.with_scoring`. It has a ``scoring`` parameter, which can be +anything scikit-learn's :func:`~sklearn.model_selection.cross_validate` accepts +for ``scoring`` such as a metric name, callable scorer, or dict mapping metric +names to scorers (see the reference documentation of +:meth:`DataOp.skb.with_scoring` for details). + +It also accepts a ``kwargs`` argument, which are passed to the scorer when +evaluating the learner. + +Importantly, the ``scoring`` and ``kwargs`` can be DataOps, which will be +computed when scoring the learner -- so for example, sample weights can be +computed dynamically. + +Using the same toy dataset as above, suppose we want to give more weight to more +expensive products: + +>>> X = data[["description", "price"]].skb.mark_as_X(cv=2) +>>> y = data["category"].skb.mark_as_y() +>>> pred = X.skb.apply(DummyClassifier(), y=y) + +The default score is the (unweighted) accuracy: + +>>> pred.skb.cross_validate() # doctest: +SKIP + fit_time score_time test_score +0 0.003982 0.002405 0.666667 +1 0.002582 0.002169 0.666667 + +We set the scoring to provide the sample weights: + +>>> sample_weight = X["price"] +>>> pred.skb.with_scoring( +... "accuracy", kwargs={"sample_weight": sample_weight} +... 
).skb.cross_validate() # doctest: +SKIP + fit_time score_time test_accuracy +0 0.003045 0.003275 0.888889 +1 0.002659 0.003026 0.647059 + +Besides passing extra arguments, :meth:`DataOp.skb.with_scoring` can also be +useful to control what should be used as the default scoring metric for our +learner, just as the ``cv`` parameter of :meth:`DataOp.skb.mark_as_X` defines +the default cross-validation splitting strategy. + +>>> split = pred.skb.train_test_split() +>>> learner = pred.skb.with_scoring('neg_log_loss').skb.make_learner() +>>> learner.fit(split['train']) +SkrubLearner(data_op= (1 scorers)> + This DataOp will be scored with: + - 'neg_log_loss' + Use .skb.cross_validate(…) or .skb.make_learner(…).score(…) to compute scores.) +>>> learner.score(split['test']) # doctest: +SKIP +-0.6365141682948128 + +Note that the score above is negative: it is the negative log loss we passed to +``with_scoring``, and not the default score (accuracy, which would be positive). + +:meth:`DataOp.skb.with_scoring` only changes how scoring is performed +(the outputs of :meth:`DataOp.skb.cross_validate`, +:meth:`DataOp.skb.make_randomized_search`, :class:`SkrubLearner.score ` etc.), +**not** the actual outputs of the learner (it does *not* affect the outputs of +:meth:`DataOp.skb.eval`, :class:`SkrubLearner.predict `, etc.). + +This method can be called several times to add scorers that take different +kwargs. See the reference documentation for details. diff --git a/skrub/_docs/modules/data_ops/validation/tuning_with_optuna.rst b/skrub/_docs/modules/data_ops/validation/tuning_with_optuna.rst new file mode 100644 index 000000000..be52c0637 --- /dev/null +++ b/skrub/_docs/modules/data_ops/validation/tuning_with_optuna.rst @@ -0,0 +1,219 @@ +.. currentmodule:: skrub +.. _user_guide_data_ops_tuning_optuna: + +..
|make_randomized_search| replace:: :func:`~skrub.DataOp.skb.make_randomized_search` + + +Tuning DataOps with Optuna +========================== + +Optuna is a powerful hyperparameter optimization framework that +can be used to efficiently search for the best hyperparameters for machine +learning models; Optuna includes both sophisticated search algorithms and +tools to monitor and visualize the optimization process. + +There are two main ways of using Optuna with skrub DataOps: either by using +Optuna as a ``backend`` in the +|make_randomized_search| +method, or by creating an Optuna study directly and using it to pick values for +skrub choices when calling :meth:`DataOp.skb.make_learner()`. + +.. note:: + + To use Optuna with skrub, you need to have Optuna installed in your Python + environment. You can install it using pip: + + .. code-block:: bash + + pip install optuna + + +Using Optuna as a backend for randomized search +------------------------------------------------- +The easiest way to use Optuna with skrub is to use it as a backend for +|make_randomized_search|. This allows us to leverage Optuna's advanced +sampling algorithms and features while keeping the same familiar interface as +for other search methods. + +We start by defining a DataOp containing choices: + +>>> import skrub +>>> from sklearn.datasets import make_classification +>>> from sklearn.linear_model import LogisticRegression +>>> from sklearn.feature_selection import SelectKBest +>>> from sklearn.ensemble import HistGradientBoostingClassifier +>>> from sklearn.dummy import DummyClassifier + +>>> X_a, y_a = make_classification(random_state=0) +>>> X, y = skrub.X(X_a), skrub.y(y_a) +>>> selector = SelectKBest(k=skrub.choose_int(4, 20, log=True, name='k')) +>>> logistic = LogisticRegression(C=skrub.choose_float(0.1, 10.0, log=True, name="C")) +>>> hgb = HistGradientBoostingClassifier( +... learning_rate=skrub.choose_float(.01, .5, log=True, name="learning_rate"), +... random_state=0, +...
) +>>> classifier = skrub.choose_from( +... {"logistic": logistic, "hgb": hgb, "dummy": DummyClassifier()}, name="classifier" +... ) +>>> pred = X.skb.apply(selector, y=y).skb.apply(classifier, y=y) +>>> print(pred.skb.describe_param_grid()) +- k: choose_int(4, 20, log=True, name='k') + classifier: 'logistic' + C: choose_float(0.1, 10.0, log=True, name='C') +- k: choose_int(4, 20, log=True, name='k') + classifier: 'hgb' + learning_rate: choose_float(0.01, 0.5, log=True, name='learning_rate') +- k: choose_int(4, 20, log=True, name='k') + classifier: 'dummy' + + +Now, we can create a randomized search using Optuna as the backend: + +>>> search = pred.skb.make_randomized_search(fitted=True, random_state=0, backend="optuna") # doctest: +SKIP +Running optuna search for study skrub_randomized_search_c4af73b2-45fb-49ca-9f06-092d74aa8118 in storage .../tmpuor7hqjm_skrub_optuna_search_storage/optuna_storage + +It is possible to access the same parameters as with the default backend: + +>>> search.results_ # doctest: +SKIP + k C learning_rate classifier mean_test_score +0 4 NaN 0.013146 hgb 0.93 +1 4 NaN 0.040454 hgb 0.92 +2 19 NaN 0.019968 hgb 0.92 +3 4 0.645966 NaN logistic 0.92 +4 4 NaN 0.023337 hgb 0.92 +5 8 NaN 0.097994 hgb 0.90 +6 9 NaN 0.104104 hgb 0.88 +7 14 0.391899 NaN logistic 0.81 +8 20 NaN NaN dummy 0.50 +9 9 NaN NaN dummy 0.50 + +The best learner and best hyperparameters can be accessed as usual: + +>>> search.best_learner_.describe_params() # doctest: +SKIP +{'k': 4, 'learning_rate': 0.01314593370942781, 'classifier': 'hgb'} + +|make_randomized_search| +accepts ``sampler`` and ``timeout`` parameters to customize the Optuna study. +Optuna studies feature a wide range of additional parameters, which can be accessed +by using Optuna directly with skrub learners, as shown in the next section. + +A more complete example that includes more advanced usage is available in +:ref:`example_optuna_choices`. 
+ +Setting a storage for the Optuna study +-------------------------------------- +When using Optuna as a backend for hyperparameter search, it is possible to +specify a storage option to persist the study and its results. This allows us to +resume the search later or analyze the results after the search is complete. +This can be done by providing the ``storage`` parameter to +|make_randomized_search|. + +.. code-block:: python + + search = pred.skb.make_randomized_search( + fitted=True, + random_state=0, + backend="optuna", + storage="sqlite:///optuna_study.db", # Use a SQLite database file + ) + +If no storage is provided, a temporary storage is used during optimization, then +the study is moved to an in-memory storage once the search completes so the +resulting search object is self-contained. + +Using Optuna directly +--------------------- +It is also possible to use Optuna directly with skrub DataOps. This allows for more +flexibility and control over the optimization process, as we can define custom +objectives and leverage Optuna's advanced features, such as the ask-and-tell interface, +trial pruning, and multi-objective optimization. + +In this case, rather than running the hyperparameter search through +|make_randomized_search|, +the :class:`optuna.Study ` runs the hyperparameter +search by defining an objective function that uses a skrub +learner with hyperparameters suggested by Optuna. + +:meth:`optimize ` is given an +``objective`` function. The ``objective`` must accept a +:class:`~optuna.trial.Trial` object (which is produced by the study and picks +the parameters for a given evaluation of the objective) and return the value +to maximize (or minimize). + +To use Optuna with a :class:`DataOp`, we just need to pass the Trial object +to :meth:`DataOp.skb.make_learner`. This creates a :class:`SkrubLearner` +initialized with the parameters picked by the optuna Trial. 
+ +We can then cross-validate the :class:`SkrubLearner`, or score it however we prefer, +and return the score so that the optuna Study can take it into account. + +Here we return a single score (the mean cross-validated accuracy), but multi-objective +optimization is also possible. Please refer to the Optuna documentation for +more information. + +>>> import optuna # doctest: +SKIP + +>>> def objective(trial): # doctest: +SKIP +... learner = pred.skb.make_learner(choose=trial) +... cv_results = skrub.cross_validate(learner, environment=pred.skb.get_data(), cv=4) +... return cv_results["test_score"].mean() + +>>> study = optuna.create_study(direction="maximize") # doctest: +SKIP +>>> study.optimize(objective, n_trials=16) # doctest: +SKIP +>>> best_params = study.best_params # doctest: +SKIP + +Then, we can create the best learner using the best trial found by Optuna: + +>>> best_learner = pred.skb.make_learner(choose=study.best_trial) # doctest: +SKIP + +The learner can also be defined as follows: + +>>> best_learner = pred.skb.make_learner() # doctest: +SKIP +>>> best_learner.set_params(**study.best_params) # doctest: +SKIP +SkrubLearner(data_op=) + +Then, we can inspect the parameters as usual: + +>>> best_learner.describe_params() # doctest: +SKIP +{'k': 12, 'learning_rate': 0.06401143720094754, 'classifier': 'hgb'} + +You can find a more complete example in :ref:`example_optuna_choices`. + + +Parallelism +----------- + +Optuna's :meth:`optuna.study.Study.optimize` uses thread-based parallelism. When +we use :meth:`DataOp.skb.make_randomized_search` with the Optuna backend, both +threading and multiprocessing can be used. Skrub will choose based on the joblib +configuration: if joblib is configured to use processes (the default), +parallelization is done with joblib, and if joblib is configured to use the +"threading" backend, Optuna's built-in thread-based parallelism is used instead.
+ +When the ``timeout`` parameter is used, Optuna's built-in, thread-based +parallelization is always used regardless of the joblib configuration. + + +Using the Optuna dashboard +-------------------------- +Optuna provides a dashboard that allows us to visualize +and monitor the optimization process in real-time. This can be especially useful +for long-running hyperparameter searches. +To use the Optuna Dashboard, we need to install it first: + +.. code-block:: bash + + pip install optuna-dashboard + +We can then start the dashboard by running the following command in the terminal: + +.. code-block:: bash + + optuna-dashboard STORAGE_URL + +Where ``STORAGE_URL`` is the same storage URL used in the Optuna study. + +We can then access the dashboard in our web browser at +``http://localhost:8080`` (by default). The dashboard provides various visualizations +and tools to analyze the optimization process, such as parameter importance, +optimization history, and parallel coordinate plots. diff --git a/skrub/_docs/modules/default_wrangling/apply_to_cols.rst b/skrub/_docs/modules/default_wrangling/apply_to_cols.rst new file mode 100644 index 000000000..fc9b6e07c --- /dev/null +++ b/skrub/_docs/modules/default_wrangling/apply_to_cols.rst @@ -0,0 +1,160 @@ +.. currentmodule:: skrub + +.. |ApplyToCols| replace:: :class:`ApplyToCols` +.. |TableVectorizer| replace:: :class:`TableVectorizer` +.. |selectors| replace:: :mod:`skrub.selectors` +.. |s.string| replace:: :meth:`~skrub.selectors.string` +.. |s.numeric| replace:: :meth:`~skrub.selectors.numeric` +.. |RejectColumn| replace:: :class:`core.RejectColumn` +.. |ToDatetime| replace:: :class:`ToDatetime` +.. |SingleColumnTransformer| replace:: :class:`~skrub.core.SingleColumnTransformer` +.. |StandardScaler| replace:: :class:`~sklearn.preprocessing.StandardScaler` +.. |OneHotEncoder| replace:: :class:`~sklearn.preprocessing.OneHotEncoder` +.. |OrdinalEncoder| replace:: :class:`~sklearn.preprocessing.OrdinalEncoder` +.. 
|make_pipeline| replace:: :func:`~sklearn.pipeline.make_pipeline` + +.. _user_guide_multiple_columns: + +Transforming only some columns with |ApplyToCols| +=========================================================== + +Very often and for various reasons, transformers must be applied only to some of the +columns in a dataframe. For example, all numeric columns in a dataframe may need +to be scaled at the same time, while string columns should be left alone. +While the heuristics used by the :class:`TableVectorizer` are usually good enough +to apply the proper transformers to different datatypes, using it may not be an +option in all cases. + +|ApplyToCols| (optionally paired with the |selectors|) allows transforming specific +columns with a large degree of control: |ApplyToCols| maps a transformer to columns +in a dataframe, so that all columns that satisfy a certain condition are transformed, +while the others are left untouched. |ApplyToCols| and the |selectors| are similar +to scikit-learn's :class:`~sklearn.compose.ColumnTransformer`. + + +Using selectors to choose or exclude columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If a skrub transformer has a ``cols`` parameter to specify a column list, +that can be a selector as well. Selectors give more control over which columns +are being transformed: they are discussed at length in the +:ref:`selectors user guide`. + +|ApplyToCols| can be used to transform a subset of columns in a dataframe, while +leaving the non-selected columns unchanged. In this example, we want to apply +an |OrdinalEncoder| only on the text column, and a |StandardScaler| on the numeric +column. Columns that aren't selected are passed through unchanged, and this allows +chaining |ApplyToCols| transformers with |make_pipeline|.
+ +>>> import pandas as pd +>>> df = pd.DataFrame({"text": ["foo", "bar", "baz"], "number": [1, 2, 3]}) + +We use the |s.string| selector to choose only the text column, and |s.numeric| +to select only the numeric column: + +>>> import skrub.selectors as s +>>> from skrub import ApplyToCols +>>> from sklearn.preprocessing import OrdinalEncoder, StandardScaler +>>> +>>> numeric = ApplyToCols(StandardScaler(), cols=s.numeric()) +>>> string = ApplyToCols(OrdinalEncoder(), cols=s.string()) + +We then concatenate the two with |make_pipeline|: + +>>> from sklearn.pipeline import make_pipeline +>>> transformed = make_pipeline(numeric, string).fit_transform(df) +>>> transformed + number text +0 -1.224745 2.0 +1 0.000000 0.0 +2 1.224745 1.0 + +If |ApplyToCols| is used with a transformer that inherits from +|SingleColumnTransformer|, or one that has the ``__single_column_transformer__`` +attribute, then the transformer will be cloned and applied separately to each +column. Most skrub transformers belong to this category. + +Here we want to apply |ToDatetime| to each of the datetime columns to convert +them to datetime dtype. |ApplyToCols| automatically detects that |ToDatetime| +should be applied to each column separately: + +>>> from skrub._to_datetime import ToDatetime +>>> df = pd.DataFrame({ +... 'date_1': ['2024-01-15', '2024-02-20', '2024-03-10'], +... 'date_2': ['2023-12-01', '2024-01-05', '2024-02-28'] +... }) +>>> df_enc = ApplyToCols(ToDatetime()).fit_transform(df) +>>> df_enc + date_1 date_2 +0 2024-01-15 2023-12-01 +1 2024-02-20 2024-01-05 +2 2024-03-10 2024-02-28 +>>> df_enc.dtypes +date_1 datetime64[...] +date_2 datetime64[...] +dtype: ... + +We can also combine |ApplyToCols| with |TableVectorizer| to vectorize only +specific columns and skip others, like ID columns: + +>>> from skrub import TableVectorizer +>>> df = pd.DataFrame({ +... 'id': ["c1", "c2", "c3"], +... 'city': ['Paris', 'Rome', 'Madrid'], +...
'date': ['2023-01-15', '2023-02-20', '2023-03-10'] +... }) +>>> ApplyToCols(TableVectorizer(), cols=s.all() - "id").fit_transform(df) # doctest: +SKIP +id city_Madrid city_Paris city_Rome date_year date_month date_day date_total_seconds +0 c1 0.0 1.0 0.0 2023.0 1.0 15.0 1.673741e+09 +1 c2 0.0 0.0 1.0 2023.0 2.0 20.0 1.676851e+09 +2 c3 1.0 0.0 0.0 2023.0 3.0 10.0 1.678406e+09 + +Note that the column "id" was not encoded and was instead left as-is. + +Rejecting columns that cannot be handled by a transformer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +|ApplyToCols| can allow the underlying encoder to decide which columns it can be applied to. +For example, if we do not know in advance which columns can be transformed to datetime, +we can use |ApplyToCols| to map |ToDatetime| to all columns in a dataframe and pass +``allow_reject=True``. In that case, non-datetime columns are rejected by the +transformer and passed through unchanged. By default, all columns in +``cols`` must be transformed, and if one of them cannot be transformed an exception +will be raised and the transformation will fail. + +It is possible to change how rejected columns are handled through the ``allow_reject`` +parameter. +By default, no special handling is performed and rejections are considered +to be errors: + +>>> from skrub._to_datetime import ToDatetime +>>> df = pd.DataFrame(dict(birthday=["29/01/2024"], city=["London"])) +>>> df + birthday city +0 29/01/2024 London +>>> to_datetime = ApplyToCols(ToDatetime()) +>>> to_datetime.fit_transform(df) # doctest: +SKIP +Traceback (most recent call last): + ... +skrub.core.RejectColumn: Could not find a datetime format for column 'city'. +Transformer ToDatetime.fit_transform failed on column 'city'. See above for the full traceback. + +However, setting ``allow_reject=True`` gives the transformer itself some +control over which columns it should be applied to.
For example, we can try to +parse all columns but allow +the transformer to reject those that, upon inspection, do not contain dates. + +>>> to_datetime = ApplyToCols(ToDatetime(), allow_reject=True) +>>> transformed = to_datetime.fit_transform(df) +>>> transformed + birthday city +0 2024-01-29 London + +>>> transformed.dtypes +birthday datetime64[...] +city ... +dtype: ... + +Advanced usage of |ApplyToCols| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +For more advanced use cases, refer to the examples section of the |ApplyToCols| +docstring, and to :ref:`this user guide section `. diff --git a/skrub/_docs/modules/default_wrangling/cleaning_dataframes.rst b/skrub/_docs/modules/default_wrangling/cleaning_dataframes.rst new file mode 100644 index 000000000..e739d7832 --- /dev/null +++ b/skrub/_docs/modules/default_wrangling/cleaning_dataframes.rst @@ -0,0 +1,121 @@ +.. |DropUninformative| replace:: :class:`~skrub.DropUninformative` +.. |Cleaner| replace:: :class:`~skrub.Cleaner` +.. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer` +.. |ToDatetime| replace:: :class:`~skrub.ToDatetime` + +.. _user_guide_cleaning_dataframes: + +|Cleaner|: sanitizing a dataframe +--------------------------------- + +Very often, the first steps in preparing a dataframe for further use involve +understanding the datatypes in the data and changing them into a more suitable format +(e.g., from string to number or datetime). + +The |Cleaner| aids with this by running common operations on each column, including +replacing "null-looking" strings (e.g., ``NULL``) with actual null values, and +parsing datetimes and numbers. + +.. admonition:: All the transformations done by the |Cleaner| + :collapsible: closed + + - Clean null strings: Replace strings typically used to represent missing values + with a null value suitable for the column under consideration. + + - |DropUninformative|: Drop the column if it is considered "uninformative." 
+ A column is considered "uninformative" if it contains only missing values + (``drop_null_fraction``), or only a constant value (``drop_if_constant``). + By default, the |Cleaner| keeps all columns + unless they contain only missing values. Refer to :ref:`user_guide_drop_uninformative` + for more detail on this operation. + + - |ToDatetime|: Parse datetimes represented as strings and return them as + actual datetimes with the correct dtype. If ``datetime_format`` is provided, + it is forwarded to |ToDatetime|. Otherwise, the format is guessed according + to common datetime formats. + + - Convert to strings: Convert columns to strings unless they have a more informative + dtype, such as numeric, categorical, or datetime. + +If ``parse_numbers`` is set to ``True``, the ``Cleaner`` will parse +string columns that contain only numbers and convert them to ``float32``. +If ``cast_to_float32=True``, the ``Cleaner`` will also convert numeric columns +(e.g. ``float64``, ``int64``) to ``float32``. + +The |Cleaner| is a scikit-learn compatible transformer: + +>>> from skrub import Cleaner +>>> import pandas as pd +>>> df = pd.DataFrame({ +... "id": [1, 2, 3], +... "all_missing": ["", "", ""], +... "date": ["2024-05-05", "2024-05-06", "2024-05-07"], +... }) +>>> df_clean = Cleaner().fit_transform(df) +>>> df_clean + id date + 0 1 2024-05-05 + 1 2 2024-05-06 + 2 3 2024-05-07 +>>> df_clean.dtypes +id int64 +date datetime64[...] +dtype: ... + +Note that the ``"all_missing"`` column has been dropped, and that the ``"date"`` +column has been correctly parsed as a datetime column. + +Parsing numeric-looking strings with the |Cleaner| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default, when the |Cleaner| encounters a string series that contains only +numeric-looking values (for example ``["1", "2", "3"]``), it leaves it +unchanged. 
+ +The |Cleaner| can parse those values by setting ``parse_numbers=True``: + +>>> from skrub import Cleaner +>>> cleaner = Cleaner(parse_numbers=True) +>>> import pandas as pd +>>> df = pd.DataFrame({ +... "id_as_str": ["1", "2", "3"], +... "id": [1, 2, 3], +... }) +>>> df.dtypes +id_as_str ... +id int64 +dtype: ... +>>> df_cleaned = cleaner.fit_transform(df) +>>> df_cleaned.dtypes +id_as_str float32 +id int64 +dtype: ... + +Parsed string values are converted to ``float32`` (not to ``int64`` or +``float64``), to keep a consistent numeric representation that is compatible +with downstream scikit-learn transformers. + +When ``parse_numbers=False`` (default), both columns keep their original dtypes. + +Downcasting float dtypes to ``float32`` with the |Cleaner| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default, floating-point columns (e.g. ``float64``) keep their original dtype. +To downcast numeric columns to ``float32``, set +``cast_to_float32=True``: + +>>> from skrub import Cleaner +>>> cleaner = Cleaner(cast_to_float32=True) +>>> import pandas as pd +>>> df = pd.DataFrame({ +... "f64": [1.0, 2.0, 3.0], +... "i64": [1, 2, 3], +... }) +>>> df.dtypes +f64 float64 +i64 int64 +dtype: ... +>>> cleaner.fit_transform(df).dtypes +f64 float32 +i64 float32 +dtype: ... diff --git a/skrub/_docs/modules/default_wrangling/table_vectorizer.rst b/skrub/_docs/modules/default_wrangling/table_vectorizer.rst new file mode 100644 index 000000000..a4b4be443 --- /dev/null +++ b/skrub/_docs/modules/default_wrangling/table_vectorizer.rst @@ -0,0 +1,152 @@ +.. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer` +.. |Cleaner| replace:: :class:`~skrub.Cleaner` +.. |DropUninformative| replace:: :class:`~skrub.DropUninformative` +.. |DatetimeEncoder| replace:: :class:`~skrub.DatetimeEncoder` +.. |StringEncoder| replace:: :class:`~skrub.StringEncoder` +.. |OneHotEncoder| replace:: :class:`~sklearn.preprocessing.OneHotEncoder` +.. 
|OrdinalEncoder| replace:: :class:`~sklearn.preprocessing.OrdinalEncoder` +.. |TextEncoder| replace:: :class:`~skrub.TextEncoder` +.. |ApplyToCols| replace:: :class:`~skrub.ApplyToCols` +.. |ToCategorical| replace:: :class:`~skrub.ToCategorical` + +.. _user_guide_table_vectorizer: + +Transforming a table into an array of numeric features: |TableVectorizer| +------------------------------------------------------------------------- + +In tabular machine learning pipelines, practitioners often convert categorical +features to numeric features using various encodings (|OneHotEncoder|, |OrdinalEncoder|, +etc.). + +The objective of the |TableVectorizer| is to take any dataframe as input, and +produce as output a feature-engineered version of the dataframe. + +Initially, the |TableVectorizer| parses the data type of each column and maps each +column to an encoder, in order to produce numeric features for machine learning +models. + +Parsing is handled internally by running a |Cleaner| on the input data. +Note that in this case numeric values are always converted to ``float32`` +(whereas the default |Cleaner| behavior is to keep the original datatype): this +is to ensure that the numeric dtype (including that of the missing values) is +consistent for the downstream methods. For most applications, ``float32`` has a +sufficient precision, and reduces the memory footprint of the resulting features. + +The same parameters used for the |Cleaner| can also be set when creating the +|TableVectorizer|: this includes parameters for |DropUninformative| +(``drop_null_fraction`` etc.), and a ``datetime_format`` parameter for the +datetime parsing step. 
+ + +After detecting the datatypes, the |TableVectorizer| maps columns to one of +four groups depending on the datatype and, for categorical/string columns, +the number of unique values. + +The default transformers used by the |TableVectorizer| for each column category +are the following: + +- **High-cardinality categorical columns**: |StringEncoder| +- **Low-cardinality categorical columns**: scikit-learn |OneHotEncoder| +- **Numeric columns**: "passthrough" (no transformation) +- **Datetime columns**: |DatetimeEncoder| + +**High cardinality** categorical columns are those with more than 40 unique values, +while all other categorical columns are considered **low cardinality**: the +threshold can be changed by setting the ``cardinality_threshold`` parameter of +|TableVectorizer|, or by changing the configuration parameter with the same name +using :func:`~skrub.set_config`. + +To change the encoder or alter default parameters, instantiate an encoder and pass +it to |TableVectorizer|. + +>>> from skrub import TableVectorizer, DatetimeEncoder, TextEncoder, SquashingScaler + +>>> datetime_enc = DatetimeEncoder(periodic_encoding="circular") +>>> text_enc = TextEncoder() +>>> num_enc = SquashingScaler() +>>> table_vec = TableVectorizer(datetime=datetime_enc, high_cardinality=text_enc, numeric=num_enc) +>>> table_vec +TableVectorizer(datetime=DatetimeEncoder(periodic_encoding='circular'), + high_cardinality=TextEncoder(), numeric=SquashingScaler()) + + +Besides the transformers provided by skrub, the |TableVectorizer| can also take +user-specified transformers that are applied to given columns. + +>>> from sklearn.preprocessing import OrdinalEncoder +>>> import pandas as pd +>>> encoder = OrdinalEncoder() +>>> df = pd.DataFrame({ +... "values": ["A", "B", "C"] +...
}) + +We define the list of column-specific transformers: + +>>> specific_transformers=[(encoder, ["values"])] + +We can then encode the result: + +>>> TableVectorizer(specific_transformers=specific_transformers).fit_transform(df) + values +0 0.0 +1 1.0 +2 2.0 + +Note that the columns specified in ``specific_transformers`` are passed to the +transformer without any modification, which means that the transformer must be +able to handle the content of the column on its own. + +If you need to define complex transformers to pass to a single instance of +|TableVectorizer|, consider using the :ref:`skrub Data Ops `, +|ApplyToCols|, or the :ref:`skrub selectors ` instead, as +they are more versatile and allow a higher degree +of control over which operations are applied to which columns. + +The |TableVectorizer| is used in :ref:`example_encodings`, while the +docstring of the class provides more details on the parameters and usage, as well +as various examples. + +Numeric strings and categorical encoding +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default, columns that contain only numeric strings (e.g. ``["1", "2", "3"]``) +are parsed as numeric features by the |TableVectorizer|. The recommended way to +treat such values as categorical (e.g. IDs or codes) is to convert the column +to pandas' ``category`` dtype using |ToCategorical| with |ApplyToCols| before +vectorizing, rather than relying on keeping them as strings. 
+ +Default behavior: numeric strings are parsed as a single numeric column (feature +names are not one-hot encoded): + +>>> import pandas as pd +>>> from skrub import TableVectorizer +>>> df = pd.DataFrame({"c": ["1", "2", "3"]}) +>>> tv = TableVectorizer().fit(df) +>>> list(map(str, sorted(tv.get_feature_names_out()))) +['c'] +>>> tv = TableVectorizer() +>>> tv.fit_transform(df) +c +0 1.0 +1 2.0 +2 3.0 + +With |ToCategorical| and |ApplyToCols|, the column is treated as categorical +and produces one-hot encoded feature names: + +>>> from skrub import ApplyToCols, TableVectorizer, ToCategorical +>>> from sklearn.pipeline import make_pipeline +>>> pipe = make_pipeline( +... ApplyToCols(ToCategorical(), cols=["c"]), +... TableVectorizer(), +... ) +>>> pipe.fit(df) +Pipeline(steps=[('applytocols', ...), + ('tablevectorizer', ...)]) +>>> list(map(str, sorted(pipe.named_steps["tablevectorizer"].get_feature_names_out()))) +['c_1', 'c_2', 'c_3'] +>>> pipe.fit_transform(df) + c_1 c_2 c_3 +0 1.0 0.0 0.0 +1 0.0 1.0 0.0 +2 0.0 0.0 1.0 diff --git a/skrub/_docs/modules/default_wrangling/tabular_pipeline.rst b/skrub/_docs/modules/default_wrangling/tabular_pipeline.rst new file mode 100644 index 000000000..d2b3067eb --- /dev/null +++ b/skrub/_docs/modules/default_wrangling/tabular_pipeline.rst @@ -0,0 +1,146 @@ + +.. currentmodule:: skrub + +.. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer` +.. |tabular_pipeline| replace:: :func:`~skrub.tabular_pipeline` +.. |HistGradientBoostingRegressor| replace:: :class:`~sklearn.ensemble.HistGradientBoostingRegressor` +.. |HistGradientBoostingClassifier| replace:: :class:`~sklearn.ensemble.HistGradientBoostingClassifier` +.. |Pipeline| replace:: :class:`~sklearn.pipeline.Pipeline` +.. |SquashingScaler| replace:: :class:`~skrub.SquashingScaler` +.. |SimpleImputer| replace:: :class:`~sklearn.impute.SimpleImputer` +.. |ToCategorical| replace:: :class:`~skrub.ToCategorical` + +.. 
_user_guide_tabular_pipeline: + +Building robust ML baselines with |tabular_pipeline| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The |tabular_pipeline| is a function that, given a scikit-learn estimator, +returns a full scikit-learn |Pipeline| that contains a |TableVectorizer| +followed by the given estimator. +If the estimator is a linear model (e.g., ``Ridge``, ``LogisticRegression``), +|tabular_pipeline| adds a |SquashingScaler| and a |SimpleImputer| to the pipeline. + +>>> from sklearn.linear_model import LinearRegression +>>> from skrub import tabular_pipeline +>>> tabular_pipeline(LinearRegression()) +Pipeline(steps=[('tablevectorizer', + TableVectorizer(datetime=DatetimeEncoder(periodic_encoding='spline'))), + ('simpleimputer', SimpleImputer(add_indicator=True)), + ('squashingscaler', SquashingScaler(max_absolute_value=5)), + ('linearregression', LinearRegression())]) + +It is also possible to call the function with the name of the task that must be +performed (``regression``/``regressor``, ``classification``/``classifier``) to +build a pipeline that uses a +|HistGradientBoostingRegressor|/|HistGradientBoostingClassifier|. + +>>> from skrub import tabular_pipeline +>>> tabular_pipeline("regression") +Pipeline(steps=[('tablevectorizer', + TableVectorizer(...), + ('histgradientboostingregressor', + HistGradientBoostingRegressor(...))]) + +The pipeline prepared by |tabular_pipeline| is a strong first baseline for most +problems, but may not beat properly tuned ad-hoc pipelines. + +.. 
list-table:: Parameter values choice of :class:`TableVectorizer` when using the :func:`tabular_pipeline` function + :header-rows: 1 + :widths: 25 25 25 25 + + * - Parameter + - ``RandomForest`` models + - ``HistGradientBoosting`` models + - Linear models and others + * - Low-cardinality encoder + - :class:`~sklearn.preprocessing.OrdinalEncoder` + - Native support + - :class:`~sklearn.preprocessing.OneHotEncoder` + * - High-cardinality encoder + - :class:`StringEncoder` + - :class:`StringEncoder` + - :class:`StringEncoder` + * - Numeric preprocessor + - No processing + - No processing + - :class:`~skrub.SquashingScaler` + * - Date preprocessor + - :class:`DatetimeEncoder` + - :class:`DatetimeEncoder` + - :class:`DatetimeEncoder` with spline encoding + * - Missing value strategy + - Native support + - Native support + - :class:`~sklearn.impute.SimpleImputer` + + +The logic used by the tabular pipeline is quite simple +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The logic that is used by the |tabular_pipeline| is in fact quite simple, so +users do not lose much if they decide to write their own pipeline instead. +In practice it does only three things: + +- It chooses a |TableVectorizer| configuration from the estimator type. For + example, linear models get spline datetime features, while histogram gradient + boosting models with ``categorical_features="from_dtype"`` get + ``low_cardinality=ToCategorical()``. +- It inserts a |SimpleImputer| when the estimator cannot handle missing values. +- It inserts a |SquashingScaler| for estimators that benefit from scaling, and + skips it for tree ensembles. + +If your use case needs more control, writing the full pipeline yourself is +usually straightforward and gives you access to the exact same building blocks. +See the source of :func:`~skrub.tabular_pipeline` for the exact logic. 
+ +Extending the pipeline with the ``.steps`` attribute +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can use the ``.steps`` attribute of the resulting pipeline together with +:func:`~sklearn.pipeline.make_pipeline` to build a new pipeline that has more +than just the steps in the tabular pipeline. The ``steps`` attribute of a +scikit-learn |Pipeline| is a list of ``(name, estimator)`` pairs, so we can +extract the estimators from the preprocessing steps and pass them to +``make_pipeline`` while inserting an extra transformation before the final +estimator: + +>>> from sklearn.feature_selection import SelectPercentile, f_regression +>>> from sklearn.pipeline import make_pipeline +>>> from skrub import tabular_pipeline +>>> base_pipeline = tabular_pipeline("regressor") +>>> extended_pipeline = make_pipeline( +... *[step[1] for step in base_pipeline.steps[:-1]], +... SelectPercentile(score_func=f_regression, percentile=50), +... base_pipeline.steps[-1][1], +... ) +>>> [name for name, _ in extended_pipeline.steps] +['tablevectorizer', 'selectpercentile', 'histgradientboostingregressor'] + +Here ``[step[1] for step in base_pipeline.steps[:-1]]`` extracts the +estimators from all preprocessing steps, while omitting the final estimator. +Those preprocessing estimators are unpacked into ``make_pipeline``, then a +supervised feature-selection step and the original estimator are appended. This +pattern is useful whenever you want to add something such as feature selection, +dimensionality reduction, or calibration without rewriting the whole pipeline +from scratch. + +Using a pipeline as the estimator +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The estimator passed to |tabular_pipeline| can itself be a |Pipeline|. 
This is +often the simplest way to add estimator-specific postprocessing while keeping +the default table preprocessing: + +>>> from sklearn.decomposition import PCA +>>> from sklearn.linear_model import Ridge +>>> from sklearn.pipeline import make_pipeline +>>> from skrub import tabular_pipeline +>>> model_pipeline = make_pipeline(PCA(n_components=20), Ridge()) +>>> full_pipeline = tabular_pipeline(model_pipeline) +>>> [name for name, _ in full_pipeline.steps] +['tablevectorizer', 'simpleimputer', 'squashingscaler', 'pipeline'] + +The user-provided estimator pipeline is appended as a single final step. This +means that ``tabular_pipeline`` can still decide which preprocessing steps to +add before your own estimator logic. diff --git a/skrub/_docs/modules/joining_tables/assembling.rst b/skrub/_docs/modules/joining_tables/assembling.rst new file mode 100644 index 000000000..51601324c --- /dev/null +++ b/skrub/_docs/modules/joining_tables/assembling.rst @@ -0,0 +1,61 @@ +.. currentmodule:: skrub + +Assembling: joining multiple tables +=================================== + +Assembling is the process of collecting and joining together tables. Good analytics +requires including as much information as possible, often from different sources. + +Skrub allows you to join tables on keys of different types (string, numerical, +datetime) with imprecise correspondence. + + + +Joining external tables for machine learning +-------------------------------------------- + +Joining is straightforward for two tables because you only need to identify +the common key. + +In addition, skrub also enables more advanced analysis: + +- :class:`Joiner`: fuzzy-joins an external table using a scikit-learn + transformer, which can be used in a scikit-learn :class:`~sklearn.pipeline.Pipeline`. + Pipelines are useful for cross-validation and hyper-parameter search, but also + for model deployment.
+ +- :class:`AggJoiner`: instead of performing 1:1 joins like :class:`Joiner`, + :class:`AggJoiner` + aggregates the external table first, then joins it on the main table. + Alternatively, it can aggregate the main table and then join it back onto itself. + +- :class:`AggTarget`: in some settings, one can derive powerful features from + the target ``y`` itself. AggTarget aggregates the target without risking data + leakage, then joins the result back on the main table, similar to AggJoiner. + +- :class:`MultiAggJoiner`: extension of the :class:`AggJoiner` that joins multiple + auxiliary tables onto the main table. + +Fuzzy joining tables +--------------------- + +Joining two dataframes can be hard as the corresponding keys may be different. + +:func:`~skrub.fuzzy_join` uses similarities in entries to join tables on one or more +related columns. Furthermore, it chooses the type of fuzzy matching used based +on the column type (string, numeric or datetime). It also outputs a similarity +score, to single out bad matches, so that they can be dropped or replaced. + +In sum, equivalent to :func:`pandas.merge`, the :func:`fuzzy_join` +has no need for pre-cleaning. + + +Using the :class:`InterpolationJoiner` to join tables using ML predictions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The :class:`InterpolationJoiner` is a transformer that performs an operation similar +to that of a regular equi-join, but that can handle the presence of missing rows +in the right table (the table to be added). This is done by estimating the value +that the missing rows would have by training a machine learning model on the data +we have access to. + +This transformer is explored in more detail in :ref:`this example `. 
diff --git a/skrub/_docs/modules/multi_column_operations/advanced_selectors.rst b/skrub/_docs/modules/multi_column_operations/advanced_selectors.rst new file mode 100644 index 000000000..c14ddf716 --- /dev/null +++ b/skrub/_docs/modules/multi_column_operations/advanced_selectors.rst @@ -0,0 +1,126 @@ +.. currentmodule:: skrub.selectors + +.. |StandardScaler| replace:: :class:`~sklearn.preprocessing.StandardScaler` +.. |filter| replace:: :func:`filter` +.. |filter_names| replace:: :func:`filter_names` + +.. _user_guide_advanced_selectors: + +|filter| and |filter_names| to select with user-defined criteria +----------------------------------------------------------------- + +:func:`filter` and :func:`filter_names` allow +selecting columns based on arbitrary user-defined criteria. These are also used to +implement many of the other selectors provided in this module. + +:func:`filter` accepts a function which will be called on a column +(i.e., a Pandas or polars Series). This function, called a predicate, must return +``True`` if the column should be selected. + +>>> import pandas as pd +>>> import skrub.selectors as s +>>> df = pd.DataFrame( +... { +... "height_mm": [297.0, 420.0], +... "width_mm": [210.0, 297.0], +... "kind": ["A4", "A3"], +... "ID": [4, 3], +... } +... ) +>>> s.select(df, s.filter(lambda col: "A4" in col.tolist())) + kind +0 A4 +1 A3 + +:func:`filter_names` accepts a predicate that is passed the column name, +instead of the column. + +>>> s.select(df, s.filter_names(lambda name: name.endswith('mm'))) + height_mm width_mm +0 297.0 210.0 +1 420.0 297.0 + +We can pass args and kwargs that will be forwarded to the predicate, to help avoid +lambda or local functions and thus ensure the selector is picklable. 
+ +>>> s.select(df, s.filter_names(str.endswith, 'mm')) + height_mm width_mm +0 297.0 210.0 +1 420.0 297.0 + + +Example of custom criteria in :func:`filter`: selecting columns with outliers +............................................................................. + +The :func:`filter` selector can be used to select columns based on custom +criteria. For example, we can define a function that checks if a column contains +outliers using the Interquartile Range (IQR) method, and then use this function +with :func:`filter` to select such columns. + +Specifically, we define a function that computes the IQR (Inter Quartile Range) of a column +and checks if any data points extend further than 2 IQRs of the lower and upper quartile. + +>>> def has_outliers(column): +... q1 = column.quantile(0.25) +... q3 = column.quantile(0.75) +... IQR = q3 - q1 +... lower_bound = q1 - 2 * IQR +... upper_bound = q3 + 2 * IQR +... outliers = (column < lower_bound) | (column > upper_bound) +... return any(outliers) + +>>> from skrub import SelectCols +>>> select = SelectCols(s.filter(has_outliers)) +>>> data = pd.DataFrame({ +... "A": [10, 12, 14, 15, 100], # Outlier in column A +... "B": [20, 22, 21, 19, 20], # No outliers in column B +... "C": [30, 29, 31, 32, 300] # Outlier in column C +... }) +>>> select.fit_transform(data) + A C +0 10 30 +1 12 29 +2 14 31 +3 15 32 +4 100 300 + + +Select columns with null values +-------------------------------- +Selectors :func:`has_nulls` and :ref:`user_guide_drop_uninformative` can be used to get information +about columns with null values. The selector :func:`has_nulls` selects columns that contain +null values and it accepts an optional ``proportion`` parameter that allows **selecting** columns +based on the proportion of null values they contain. + +Example: Selecting columns by null percentage with :func:`has_nulls` +..................................................................... 
+ +The :func:`has_nulls` selector can filter columns based on their proportion of missing values. +This is useful for identifying columns that may need imputation or further investigation. + +>>> import pandas as pd +>>> import skrub.selectors as s +>>> from skrub import SelectCols + +Create a dataset with varying amounts of missing data: + +>>> df = pd.DataFrame({ +... 'patient_id': [1, 2, 3, 4, 5, 6, 7, 8], +... 'age': [25.0, 30.0, None, 45.0, 50.0, None, 60.0, 65.0], # 25% nulls +... 'blood_pressure': [120, None, None, None, 140, None, None, 150], # 62.5% nulls +... 'diagnosis': ['flu', 'cold', None, None, None, None, None, None], # 75% nulls +... 'treatment': ['med_A', 'med_B', 'med_C', 'med_D', 'med_E', 'med_F', 'med_G', 'med_H'] # no nulls +... }) + +Select columns with more than 25% missing values: + +>>> s.select(df, s.has_nulls(proportion=0.25)) + blood_pressure diagnosis +0 120.0 flu +1 NaN cold +2 NaN ... +3 NaN ... +4 140.0 ... +5 NaN ... +6 NaN ... +7 150.0 ... diff --git a/skrub/_docs/modules/multi_column_operations/drop_uninformative.rst b/skrub/_docs/modules/multi_column_operations/drop_uninformative.rst new file mode 100644 index 000000000..2dd5555d9 --- /dev/null +++ b/skrub/_docs/modules/multi_column_operations/drop_uninformative.rst @@ -0,0 +1,107 @@ +.. |DropUninformative| replace:: :class:`~skrub.DropUninformative` +.. |ApplyToCols| replace:: :class:`~skrub.ApplyToCols` +.. |Cleaner| replace:: :class:`~skrub.Cleaner` +.. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer` + +.. _user_guide_drop_uninformative: + +Removing unneeded columns with |DropUninformative| and |Cleaner| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Data tables often include columns that do not provide meaningful information. +These columns increase computational cost and may reduce downstream performance. + +The |DropUninformative| transformer removes features that are deemed "uninformative" +using various heuristics.
These heuristics include: + +- **Dropping columns with excessive missing values**: Columns are dropped if the + fraction of missing values exceeds the specified threshold. By default, the + threshold is 1, meaning only columns with all missing values are dropped. Adjust + this behavior by setting the ``drop_null_fraction`` parameter. Setting it to + ``None`` disables this check entirely. + +- **Dropping constant columns**: Columns containing only a single unique value are + removed. This behavior is controlled by the ``drop_if_constant`` parameter, which + is set to ``False`` by default. Note that missing values are treated as distinct + values, so constant columns with missing values will not be dropped. + +|DropUninformative| is used by both |TableVectorizer| and |Cleaner|, and both +accept the same parameters for dropping columns. + +Consider the following example: + +>>> import numpy as np +>>> import pandas as pd +>>> from skrub import Cleaner +>>> data = { +... 'Const int': [1, 1, 1], # Single unique value +... 'B': [2, 3, 2], # Multiple unique values +... 'Const str': ['x', 'x', 'x'], # Single unique value +... 'D': [4, 5, 6], # Multiple unique values +... 'All nan': [np.nan, np.nan, np.nan], # All missing values +... 'All empty': ['', '', ''], # All empty strings +... } +>>> df = pd.DataFrame(data) +>>> df + Const int B Const str D All nan All empty +0 1 2 x 4 NaN +1 1 3 x 5 NaN +2 1 2 x 6 NaN + +To drop constant columns and those with only missing values: + +>>> cleaner = Cleaner(drop_if_constant=True) +>>> df_cleaned = cleaner.fit_transform(df) +>>> df_cleaned + B D +0 2 4 +1 3 5 +2 2 6 + +| +Dropping columns with many missing values +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Columns with too many missing values may not provide useful information for +downstream models. The ``drop_null_fraction`` parameter allows dropping such +columns when the proportion of missing values exceeds a specified threshold.
+ +Consider the following dataset: + +>>> import pandas as pd +>>> from skrub import DropUninformative, ApplyToCols + +>>> df = pd.DataFrame({ +... 'patient_id': [1, 2, 3, 4, 5, 6, 7, 8], +... 'age': [25.0, 30.0, None, 45.0, 50.0, None, 60.0, 65.0], +... 'blood_pressure': [120, None, None, None, 140, None, None, 150], +... 'diagnosis': ['flu', 'cold', None, None, None, None, None, None], +... 'treatment': ['med_A', 'med_B', 'med_C', 'med_D', 'med_E', 'med_F', 'med_G', 'med_H'] +... }) + +Applying |DropUninformative| only to a subset of columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We can apply the |DropUninformative| transformer to specific columns using +|ApplyToCols| and the skrub selectors. In this case, we want to drop columns with +more than 50% missing values, but only if they have ``string`` type: + +>>> import skrub.selectors as s +>>> cleaner = ApplyToCols(DropUninformative(drop_null_fraction=0.5), cols=s.string()) +>>> cleaned_df = cleaner.fit_transform(df) +>>> cleaned_df + patient_id age blood_pressure treatment +0 1 25.0 120.0 med_A +1 2 30.0 NaN med_B +2 3 NaN NaN med_C +3 4 45.0 NaN med_D +4 5 50.0 140.0 med_E +5 6 NaN NaN med_F +6 7 60.0 NaN med_G +7 8 65.0 150.0 med_H + + +As shown above, |DropUninformative| can be restricted to specific columns. +For more advanced filtering operations, refer to the User Guide on +:ref:`user_guide_selectors` and the |ApplyToCols| documentation for details +on applying transformers to specific columns. diff --git a/skrub/_docs/modules/multi_column_operations/selectors.rst b/skrub/_docs/modules/multi_column_operations/selectors.rst new file mode 100644 index 000000000..1e5004d8d --- /dev/null +++ b/skrub/_docs/modules/multi_column_operations/selectors.rst @@ -0,0 +1,237 @@ +..
_user_guide_selectors: + +Skrub Selectors, for selecting columns in a dataframe +===================================================== + +In skrub, a selector represents a column selection rule, such as "all columns +that have numeric data types, except the column ``'User ID'``". + +Selectors have two main benefits: + +- Expressing complex selection rules in a simple and concise way by combining + selectors with operators. A range of useful selectors is provided by this module. +- Delayed selection: passing a selection rule which will be evaluated later on a dataframe + that is not yet available. For example, without selectors, it is not possible to + instantiate a :class:`~skrub.SelectCols` that selects all columns except those with + the suffix 'ID' if the data on which it will be fitted is not yet available. + +Introduction to selectors +------------------------------ + +Here is an example dataframe. Note that selectors support both Pandas and Polars +dataframes:: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "height_mm": [297.0, 420.0], + ... "width_mm": [210.0, 297.0], + ... "kind": ["A4", "A3"], + ... "ID": [4, 3], + ... } + ... ) + +:func:`~skrub.selectors.cols` is a simple kind of selector which selects a fixed list of +column names:: + + >>> from skrub import selectors as s + >>> mm_cols = s.cols('height_mm', 'width_mm') + >>> mm_cols + cols('height_mm', 'width_mm') + +Using selectors: + +* **select function**: the above selector can be passed to the :func:`~skrub.selectors.select` function:: + + >>> s.select(df, mm_cols) + height_mm width_mm + 0 297.0 210.0 + 1 420.0 297.0 + +* **transformers**: various transformers in skrub use selectors to select and transform columns + in a scikit-learn pipeline: :class:`~skrub.ApplyToCols`, + :class:`~skrub.DropCols`, :class:`~skrub.SelectCols`, as + :ref:`detailed below `.
+ +* **DataOps** selectors can be passed to + :ref:`skrub DataOps ` when applying an + estimator with the :func:`skrub.DataOp.skb.apply` function:: + + >>> import skrub + >>> from sklearn.preprocessing import StandardScaler + >>> skrub.X(df).skb.apply(StandardScaler(), cols=mm_cols) + + Result: + ――――――― + kind ID height_mm width_mm + 0 A4 4 -1.0 -1.0 + 1 A3 3 1.0 1.0 + +Type of selectors +----------------- + +:func:`~skrub.selectors.all` is another simple selector, especially useful for default +arguments since it keeps all columns:: + + >>> from skrub import SelectCols + >>> SelectCols(cols=s.all()).fit_transform(df) + height_mm width_mm kind ID + 0 297.0 210.0 A4 4 + 1 420.0 297.0 A3 3 + +Selectors can be combined with operators, for example if we wanted all columns +except the "mm" columns above:: + + >>> SelectCols(s.all() - s.cols("height_mm", "width_mm")).fit_transform(df) + kind ID + 0 A4 4 + 1 A3 3 + +This module provides several kinds of selectors, which allow to select columns by +name, data type, contents, or according to arbitrary user-provided rules:: + + >>> SelectCols(s.numeric()).fit_transform(df) + height_mm width_mm ID + 0 297.0 210.0 4 + 1 420.0 297.0 3 + + >>> SelectCols(s.glob('*_mm')).fit_transform(df) + height_mm width_mm + 0 297.0 210.0 + 1 420.0 297.0 + +.. 
seealso:: + + * :ref:`selectors_details` explains more the various selectors + + * :ref:`selectors_ref` gives the exhaustive list of selectors + + * :ref:`user_guide_advanced_selectors` + +Selectors can be combined with the set operators +------------------------------------------------ + +The available operators are ``|``, ``&``, ``-``, ``^`` with the meaning of usual +python sets, and ``~`` to invert a selection: + +>>> SelectCols(s.glob('*_mm')).fit_transform(df) +height_mm width_mm +0 297.0 210.0 +1 420.0 297.0 + +>>> SelectCols(~s.glob('*_mm')).fit_transform(df) +kind ID +0 A4 4 +1 A3 3 + +>>> SelectCols(s.glob('*_mm') | s.cols('ID')).fit_transform(df) +height_mm width_mm ID +0 297.0 210.0 4 +1 420.0 297.0 3 + +>>> SelectCols(s.glob('*_mm') & s.glob('height_*')).fit_transform(df) +height_mm +0 297.0 +1 420.0 + +>>> SelectCols(s.glob('*_mm') ^ s.string()).fit_transform(df) +height_mm width_mm kind +0 297.0 210.0 A4 +1 420.0 297.0 A3 + +The operators respect the usual short-circuit rules. For example, the +following selector won't compute the cardinality of non-categorical columns: + +>>> s.categorical() & s.cardinality_below(10) +(categorical() & cardinality_below(10)) + +.. _user_guide_selectors_expand: +Using selectors with dataframe libraries +---------------------------------------- + +All selectors have the :meth:`expand` method, which allows dataframe manipulation +outside of a skrub workflow: applying it to any dataframe will return the list +of column names from the dataframe that the selector would keep. This allows selectors +to be applied on a variety of standard dataframe libraries, and can be particularly +useful on complicated combinations of selectors. For instance, the following filter +only keeps columns that do not end in ``_mm``: + +>>> some_selector = ~s.glob("*_mm") +>>> import pandas as pd +>>> df = pd.DataFrame( +... { +... "height_mm": [210.0, 297.0], +... "width_mm": [188.5, 210.0], +... "kind": ["A5", "A4"], +... "ID": [5, 4], +... 
} +... ) +>>> some_selector.expand(df) +['kind', 'ID'] + + +The :meth:`expand_index` method also exists: rather than returning a list of column names, it returns the corresponding indices from the input dataframe's column list: + +>>> some_selector.expand_index(df) +[2, 3] + +.. _selectors_and_transformer: + +Using selectors with other skrub transformers +------------------------------------------------- + +Skrub selectors are designed to be used in conjunction with :class:`~skrub.ApplyToCols`, +:class:`skrub.SelectCols`, and :class:`skrub.DropCols`, as well as +:func:`~skrub.DataOp.skb.apply` to improve their versatility in how they modify +columns. + +For example, it is possible to drop columns that have more unique values than a +certain amount by combining :func:`~skrub.selectors.cardinality_below` with +:class:`skrub.DropCols`. +To do so, a selector targeting columns that have fewer than 3 unique values +is defined, and its inverse is used as a parameter for :class:`skrub.DropCols`: + +>>> df = pd.DataFrame({ +... "not a lot": [1, 1, 1, 2, 2], +... "too_many": [1, 2, 3, 4, 5]}) + +>>> from skrub import DropCols +>>> DropCols(cols=~s.cardinality_below(3)).fit_transform(df) + not a lot +0 1 +1 1 +2 1 +3 2 +4 2 + +Selectors can be used in conjunction with :class:`~skrub.ApplyToCols` to transform columns +based on specific requirements. + +Consider the following example: + +>>> import pandas as pd +>>> data = { +... "subject": ["Math", "English", "History", "Science", "Art"], +... "grade": [5, 4, 3, 4, 3] +... } +>>> df = pd.DataFrame(data) +>>> df + subject grade +0 Math 5 +1 English 4 +2 History 3 +3 Science 4 +4 Art 3 + +We might want to apply the :class:`~sklearn.preprocessing.StandardScaler` only to the numeric column.
We can +do this like this: + +>>> from skrub import ApplyToCols +>>> from sklearn.preprocessing import StandardScaler +>>> ApplyToCols(StandardScaler(), cols=s.numeric()).fit_transform(df) + subject grade +0 Math 1.603567 +1 English 0.267261 +2 History -1.069045 +3 Science 0.267261 +4 Art -1.069045 diff --git a/skrub/_docs/modules/multi_column_operations/type_of_selectors.rst b/skrub/_docs/modules/multi_column_operations/type_of_selectors.rst new file mode 100644 index 000000000..87d598187 --- /dev/null +++ b/skrub/_docs/modules/multi_column_operations/type_of_selectors.rst @@ -0,0 +1,98 @@ +.. _selectors_details: + +Selecting based on dtype or data properties +------------------------------------------- + +Selectors can filter columns based on different conditions. + +:func:`~skrub.selectors.all` is a simple selector, especially useful for default +arguments since it keeps all columns: + +>>> import pandas as pd +>>> from skrub import SelectCols +>>> import skrub.selectors as s +>>> df = pd.DataFrame( +... { +... "height_mm": [297.0, 420.0], +... "width_mm": [210.0, 297.0], +... "kind": ["A4", "A3"], +... "ID": [4, 3], +... } +... ) +>>> SelectCols(cols=s.all()).fit_transform(df) + height_mm width_mm kind ID +0 297.0 210.0 A4 4 +1 420.0 297.0 A3 3 + +Selectors can be combined with operators, for example if we wanted all columns +except the "mm" columns above: + +>>> SelectCols(s.all() - s.cols("height_mm", "width_mm")).fit_transform(df) + kind ID +0 A4 4 +1 A3 3 + +This module provides several kinds of selectors, which allow to select columns by +name, data type, contents, or according to arbitrary user-provided rules. 
+ +>>> SelectCols(s.numeric()).fit_transform(df) + height_mm width_mm ID +0 297.0 210.0 4 +1 420.0 297.0 3 + +Selectors can be inverted with ``~``, or :func:`~skrub.selectors.inv`: + +>>> SelectCols(~s.numeric()).fit_transform(df) + kind +0 A4 +1 A3 + +>>> SelectCols(s.inv(s.numeric())).fit_transform(df) + kind +0 A4 +1 A3 + + +Selectors can work on the column names. For example, to select the columns that +end with ``_mm`` we can do: + +>>> SelectCols(s.glob('*_mm')).fit_transform(df) + height_mm width_mm +0 297.0 210.0 +1 420.0 297.0 + +| + +Categories of selectors +----------------------- + +The selectors in this module can be categorized based on what aspect of the columns +they examine: + +Selectors based on column data types +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- :func:`~skrub.selectors.numeric`: Select columns with numeric data types (float and integer) +- :func:`~skrub.selectors.integer`: Select columns with integer data types +- :func:`~skrub.selectors.float`: Select columns with floating-point data types +- :func:`~skrub.selectors.has_dtype`: Select columns whose dtype exactly matches one of the provided dtypes +- :func:`~skrub.selectors.any_date`: Select columns with date or datetime data types +- :func:`~skrub.selectors.categorical`: Select columns with categorical data types +- :func:`~skrub.selectors.string`: Select columns with string data types +- :func:`~skrub.selectors.boolean`: Select columns with boolean data types + +Selectors based on column content and properties +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- :func:`~skrub.selectors.cardinality_below`: Select columns with fewer unique + values than a threshold +- :func:`~skrub.selectors.has_nulls`: Select columns that contain at least one + null value + +Selectors based on column names +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- :func:`~skrub.selectors.cols`: Select columns explicitly by name +- :func:`~skrub.selectors.glob`: Select columns by name using Unix shell-style + pattern 
matching +- :func:`~skrub.selectors.regex`: Select columns by name using regular expressions diff --git a/skrub/_docs/modules/tablereport/exploring_dataframes_interactively.rst b/skrub/_docs/modules/tablereport/exploring_dataframes_interactively.rst new file mode 100644 index 000000000..96af04aed --- /dev/null +++ b/skrub/_docs/modules/tablereport/exploring_dataframes_interactively.rst @@ -0,0 +1,62 @@ +.. |TableReport| replace:: :class:`~skrub.TableReport` +.. |set_config| replace:: :func:`~skrub.set_config` +.. |column_associations| replace:: :func:`~skrub.column_associations` + +.. _user_guide_table_report_start: + +Exploring dataframes interactively with the |TableReport| +========================================================= + +The |TableReport| gives a high-level overview of a Dataframe or Series, suitable for +quick exploratory analysis. The report shows the first +and last 5 rows of the dataframe (decided by the ``n_rows`` parameter), as well +as additional information in other tabs. + +- The **Stats** tab reports high-level statistics for each column. +- The **Distributions** tab collects summary plots for each column (max 30 by default). +- The **Associations** tab shows `Cramer V `_ + and `Pearson correlation `_ + between columns. +- Built-in filters allow selection of columns by dtype and other conditions. + +The |TableReport| of a table can be generated as follows: + +>>> from skrub import TableReport +>>> import pandas as pd +>>> df = pd.DataFrame({ +... "id": [1, 2, 3], +... "value": [10, 20, 30], +... }) +>>> TableReport(df) # from a notebook cell + + +The command ``TableReport(df).open()`` opens the report in a browser window. + +It is also possible to export the |TableReport| in JSON or Markdown format with +:meth:`~skrub.TableReport.json()` and :meth:`~skrub.TableReport.markdown()` respectively.
+ +The generated JSON includes the plots in SVG format, which can be +quite verbose: plots can be disabled by setting ``plot_distributions=False`` +when generating the report. +Similarly, the Markdown string includes information about all columns in the dataframe, +so it can be quite lengthy for dataframes that include many columns. + +.. warning:: + + The Markdown output can be fed to AI agents to obtain insight in the data, + but it is **not** sanitized by the |TableReport|. Therefore, it should not be + used with untrusted data or for dataframes that are too large, as it could lead + to security risks or performance issues. + +A demo of the |TableReport| +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Pre-computed examples of the |TableReport| are available +`here `_, and you can +try it out on your data `here `_. + +In the **Distributions** tab, it is possible to select columns by clicking on the +checkmark icon: the name of the column is added to the bar on top, so that it may +be copied in a script. + +The TableReport can be used in a notebook cell, or it can be opened in a browser +window using ``TableReport(df).open()``. diff --git a/skrub/_docs/multi_column_operations.rst b/skrub/_docs/multi_column_operations.rst new file mode 100644 index 000000000..fc5e5e7a0 --- /dev/null +++ b/skrub/_docs/multi_column_operations.rst @@ -0,0 +1,17 @@ +.. _user_guide_multi_column_index: + +Multi-column operations +======================== + +Skrub provides various tools to extend the use of single column transformers to +multiple columns. + +.. include:: includes/big_toc_css.rst + +.. 
toctree:: + :maxdepth: 3 + + modules/multi_column_operations/drop_uninformative + modules/multi_column_operations/selectors + modules/multi_column_operations/type_of_selectors + modules/multi_column_operations/advanced_selectors diff --git a/skrub/_docs/tutorial_example.rst b/skrub/_docs/tutorial_example.rst new file mode 100644 index 000000000..40189a867 --- /dev/null +++ b/skrub/_docs/tutorial_example.rst @@ -0,0 +1,239 @@ +.. _tutorial_write_example: + +.. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer` + +How to write an example for the gallery +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This tutorial explains to new contributors how to format their examples so that +they are properly rendered in the skrub documentation gallery. + +While examples are written in plain Python code, there are some quirks to be aware of +when writing them, due to the way Sphinx and the sphinx-gallery extension work. +This tutorial explains these quirks and how to work around them. + +Location of the examples +------------------------ + +Once you decide on the subject of your example, start writing the code as a Python +script. Place the script in the ``examples/`` folder of the repository. The example +should be self-contained and runnable as a standalone script. The documentation is +built by executing the code and generating additional content from it. + +The name of the file should start with a number, followed by an underscore, +and then a short description of the example. The number is used to order the examples +in the documentation. For instance, if your example is about using the +|TableVectorizer| class, you might want to name the file ``01_table_vectorizer.py``. + +Note that the ``examples/`` folder is covered by ``pre-commit`` hooks, which run +various checks on your code when you try to commit. These checks may block you from +pushing. 
+ +Dealing with typos in the example +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If your code includes any kind of intentional typo, for example if you are trying +to correct names by replacing a string with a typo with the new one, the +``codespell`` hook will block your commit. To bypass this, update ``pyproject.toml`` +by adding the typo to the ``ignore-word-list`` entry in the ``tool.codespell`` +section. After this, commit the updated ``pyproject.toml`` file using +``git commit --no-verify`` to bypass local checks so that following commits will +ignore the typos. +Note that without updating ``pyproject.toml``, the CI will still reject commits +with typos, as it runs the same hooks that are run locally. + +Writing the example +----------------------- +Your python script should start with a docstring that briefly explains what the example +is about. This docstring can contain multiple paragraphs and will be rendered +as an RST file in the documentation, so you can use RST syntax +in it. + +Importantly, the first line of the docstring should be the title of the example, +not an RST directive (such as ``.. replace::`` or ``.. note::``). Sphinx +adds a reference to the example at the top of the page using the file name as the +title. Adding a directive at the top of the docstring would prevent proper HTML +rendering. + +This is an example of what the beginning of your example may look like: + +.. code-block:: python + + """ + Title of the example + ==================== + + This is a brief description of the example. It can contain multiple paragraphs, + and it can use RST syntax. + + .. note:: + + You can use RST directives in the docstring, such as ``.. note::``, + ``.. warning::``, ``.. seealso::``, etc. + + After the definition of the title, you may also add directives such as + ``.. replace::``, and they will be rendered properly. For example, you can add: + + .. 
|TableVectorizer| replace:: :class:`~skrub.TableVectorizer` + + + """ + +Then, you can start writing the code for the example. The content of your Python script +should be a sequence of code cells, each delimited by a line starting with ``# %%``. +These code cells may contain comments, which will be rendered as RST in the final +documentation. + +After the docstring, write the code for your example as a sequence of code cells, +each delimited by a line starting with ``# %%``. Comments in these cells will be +rendered as RST in the final documentation. + +.. code-block:: python + + # %% + # This is a comment that will be rendered as RST in the final documentation. + # You can use multiple lines for comments, and you can use RST syntax in them. + + import pandas as pd + from skrub import TableVectorizer + + # %% + # This is another code cell. You can write any Python code here. + df = pd.DataFrame({ + "A": [1, 2, 3], + "B": ["a", "b", "c"] + }) + tv = TableVectorizer() + X = tv.fit_transform(df) + print(X) + +Running the example +------------------- + +Once you have written the code for the example (or while writing it), you can run +it to see how it looks in the final documentation. Depending on your setup, you +may need to install some dependencies. Refer to your IDE's documentation for more +information on running interactive Python scripts. For example, VSCode documentation +is available `here `_. + +Once you are happy with your example, you can submit a pull request to the repository, +following the instructions in the :ref:`contributing guide `. + +Adding cross-references +----------------------- + +Adding cross-references to the documentation helps users find more information +about the concepts and functions used in your example. This step is optional, and +you may ask the maintainers for help on which cross-references to add. 
Good +cross-references include relevant user guide sections, the documentation of the +objects used in the example (like the |TableVectorizer|), or other examples. + +You can add cross-references in the docstring and comments of your example in several ways: + +- You can add references to the objects in the skrub API using the ``:class:`~skrub.ClassName``` + or ``:func:`~skrub.function_name``` roles. +- If your example uses the same objects multiple times, you can define a replacement at the top + of the docstring (right after the title) using the ``.. replace::`` directive, and then use the replacement + instead of the full role. +- You can also add references to other sections of the documentation using the + ``:ref:`label``` role, where ``label`` is the label of the section you want to reference. + + +For example, if your example uses the |TableVectorizer| class multiple times, define +a replacement at the top of the docstring, right after the title. You may also want to add a reference +to the user guide section about the |TableVectorizer| class. This can be done as follows: + +.. code-block:: python + + """ + Title of the example + ==================== + + .. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer` + + This example demonstrates how to use the |TableVectorizer| class to vectorize a dataframe. + + See the :ref:`user_guide_building_pipeline_index` guide for more information about the |TableVectorizer| class. + """ + + # %% + import pandas as pd + from skrub import TableVectorizer + + df = pd.DataFrame({ + "A": [1, 2, 3], + "B": ["a", "b", "c"] + }) + tv = TableVectorizer() + X = tv.fit_transform(df) + print(X) + +You may find more information on the cross-references in the +`official Sphinx documentation `_. + + +Generating the new documentation +-------------------------------- +Once you have written your example and added any necessary cross-references, you can +generate the new documentation to see how it looks. 
This can be done in two ways: + +- You can run the commands ``make html`` or ``make html-noplot`` in the ``doc/`` + folder of the repository to generate the HTML documentation for the entire project. +- Alternatively, you can use ``pixi run -e doc build-doc`` or ``pixi run -e doc build-doc-quick`` + from the root folder to generate the documentation. The advantage of using ``pixi`` is that + it automatically sets up a virtual environment with the necessary dependencies, so you + don't need to worry about installing them manually. + +The ``make html`` and ``pixi run -e doc build-doc`` commands generate complete +documentation by executing all example code. The ``-noplot`` (or ``-quick``) +versions skip code execution, making documentation generation much faster. Use +these faster versions to check formatting when you've already tested your example +code locally. + +The CI pipeline will always run the full documentation build, so you can safely +use ``make html-noplot`` or ``pixi run -e doc build-doc-quick`` for local testing. + + +After generating the documentation, open the ``index.html`` file in the ``doc/_build/html/`` +folder with a web browser to review the results. Check that: + +- Section titles are properly formatted. +- Any formatting in docstrings or comments is rendered as intended. For example, + Sphinx uses spaces to delimit lists and code blocks, so if you have them in the + example, make sure that they render correctly. +- Cross-references are working. You can check the logs of the Sphinx + generation to see if there are any broken references. + + +Linking your work to examples already in the documentation +---------------------------------------------------------- +After generating the documentation, you may want to add references to your example +in other relevant parts of the documentation. This helps users find your example +when reading about related topics. 
+ + +This step is done after generating the documentation because you need the final +reference name, which is created dynamically from your file name. For example, +if your file is named ``99_my_example.py``: + +1. The generated files will be in ``doc/auto_examples`` +2. A reference file will be created at ``doc/auto_examples/99_my_example.rst`` +3. The reference label will be ``.. _sphx_glr_auto_examples_99_my_example.py`` + +To link to your example from other documentation pages, use: + +.. code-block:: rst + + :ref:`sphx_glr_auto_examples_99_my_example.py` + + + +Merging your example +----------------------- +Finally, if everything looks good, commit your changes and submit a pull request +to the repository. For more information, see the :ref:`contributing guide `. + + +Your PR will be reviewed by the maintainers, who may suggest changes or improvements. +Once approved, it will be merged into the main branch, and your example will +become part of the official documentation. Thank you! diff --git a/skrub/_docs/tutorials/0000_getting_started.py b/skrub/_docs/tutorials/0000_getting_started.py new file mode 100644 index 000000000..4e7cca8f8 --- /dev/null +++ b/skrub/_docs/tutorials/0000_getting_started.py @@ -0,0 +1,221 @@ +""" +Getting Started with skrub +========================== + +This guide showcases some of the features of skrub. +Much of skrub revolves around simplifying many of the tasks that are involved +in pre-processing raw data into a format that shallow or classic machine-learning +models can understand, that is, numerical data. + +Skrub achieves this by vectorizing, assembling, and encoding tabular data through +the features we present in this example and the following ones. + +.. |TableReport| replace:: :class:`~skrub.TableReport` +.. |Cleaner| replace:: :class:`~skrub.Cleaner` +.. |set_config| replace:: :func:`~skrub.set_config` +.. |tabular_pipeline| replace:: :func:`~skrub.tabular_pipeline` +.. 
|TableVectorizer| replace:: :class:`~skrub.TableVectorizer` +.. |Joiner| replace:: :class:`~skrub.Joiner` +.. |SquashingScaler| replace:: :class:`~skrub.SquashingScaler` +.. |DatetimeEncoder| replace:: :class:`~skrub.DatetimeEncoder` +.. |ApplyToCols| replace:: :class:`~skrub.ApplyToCols` +.. |StringEncoder| replace:: :class:`~skrub.StringEncoder` +.. |TextEncoder| replace:: :class:`~skrub.TextEncoder` +""" + +# %% +# Preliminary exploration with the |TableReport| +# ---------------------------------------------- +# We start by loading the "employee salaries" dataset. Skrub dataset fetching functions +# return a Bunch object, which contains the paths to the data files. +# We can load the data into a dataframe using pandas. + +import pandas as pd + +from skrub.datasets import fetch_employee_salaries + +file_path = fetch_employee_salaries().path +employees_df = pd.read_csv(file_path) + +# %% +# The target variable is the current annual salary. We pop it from the dataframe +# to keep only the features in ``employees_df``. +salaries = employees_df.pop("current_annual_salary") + +# %% +# Typically, the first step with new data is exploration and parsing. +# To quickly get an overview of a dataframe's contents, use the |TableReport|. + +# %% +from skrub import TableReport + +TableReport(employees_df) + +# %% +# You can use the interactive display above to explore the dataset visually. +# +# .. admonition:: Additional examples +# :collapsible: closed +# +# You can see a few more `example reports`_ online. We also +# provide an experimental online demo_ that allows you to select a CSV or +# parquet file and generate a report directly in your web browser, without +# installing anything. +# +# .. _example reports: https://skrub-data.org/skrub-reports/examples/ +# .. _demo: https://skrub-data.org/skrub-reports/ +# +# From the report above, we see that there are columns with date and time stored +# as ``object`` dtype (cf. "Stats" tab of the report). 
+# Datatypes not being parsed correctly is a scenario that occurs commonly after +# reading a table. We can use the |Cleaner| to address this. +# In the next section, we show that this transformer does additional cleaning. + +# %% +# Sanitizing data with the |Cleaner| +# ---------------------------------- +# Here, we use the |Cleaner|, a transformer that sanitizes the +# dataframe by parsing nulls and dates, and by dropping "uninformative" columns +# (e.g., columns with too many nulls or that are constant). +# + +from skrub import Cleaner + +employees_df = Cleaner().fit_transform(employees_df) +TableReport(employees_df) + +# %% +# We can see from the "Stats" tab that now the column ``date_first_hired`` has been +# parsed correctly as a Datetime. + +# %% +# Easily building a strong baseline for tabular machine learning +# -------------------------------------------------------------- +# +# The goal of skrub is to ease tabular data preparation for machine learning. +# The |tabular_pipeline| function provides an easy way to build a simple +# but reliable machine learning model that works well on most tabular data. + + +# %% +from sklearn.model_selection import cross_validate + +from skrub import tabular_pipeline + +model = tabular_pipeline("regressor") +model +# %% +results = cross_validate(model, employees_df, salaries) +results["test_score"] + +# %% +# To handle rich tabular data and feed it to a machine learning model, the +# pipeline returned by |tabular_pipeline| preprocesses and encodes +# strings, categories and dates using the |TableVectorizer|. +# See its documentation or :ref:`sphx_glr_auto_examples_0010_encodings.py` for +# more details. An overview of the chosen defaults is available in +# :ref:`user_guide_tabular_pipeline`. + + +# %% +# Encoding any data as numerical features +# --------------------------------------- +# +# Tabular data can contain a variety of datatypes, from numerical to +# datetimes, categories, strings, and text. 
Encoding features in a meaningful +# way requires significant effort and is a major part of the feature engineering +# process required to properly train machine learning models. +# +# Skrub helps with this by providing various transformers that automatically +# encode different datatypes into ``float32`` features. +# +# For **numerical features**, the |SquashingScaler| applies a robust +# scaling technique that is less sensitive to outliers. Check the +# :ref:`related example ` +# for more information on the feature. +# +# For **datetime columns**, skrub provides the |DatetimeEncoder| +# which can extract useful features such as year, month, day, as well as additional +# features such as weekday or day of year. Periodic encoding with trigonometric +# or spline features is also available. Refer to the |DatetimeEncoder| +# documentation for more detail. +# + +# %% +import pandas as pd + +data = pd.DataFrame( + { + "event": ["A", "B", "C"], + "date_1": ["2020-01-01", "2020-06-15", "2021-03-22"], + "date_2": ["2020-01-15", "2020-07-01", "2021-04-05"], + } +) +data = Cleaner().fit_transform(data) +TableReport(data) +# %% +# Skrub transformers are applied column-by-column, but it's possible to use +# the |ApplyToCols| meta-transformer to apply a transformer to +# multiple columns at once. Complex column selection is possible using +# :ref:`skrub's column selectors `. + +from skrub import ApplyToCols, DatetimeEncoder + +ApplyToCols( + DatetimeEncoder(add_total_seconds=False), cols=["date_1", "date_2"] +).fit_transform(data) + +# %% +# Finally, when a column contains **categorical or string data**, it can be +# encoded using various encoders provided by skrub. The default encoder is +# the |StringEncoder|, which encodes categories using +# `Latent Semantic Analysis (LSA) `_. +# It is a simple and efficient way to encode categories and works well in +# practice. 
+ +data = pd.DataFrame( + { + "city": ["Paris", "London", "Berlin", "Madrid", "Rome"], + "country": ["France", "UK", "Germany", "Spain", "Italy"], + } +) +TableReport(data) +from skrub import StringEncoder + +StringEncoder(n_components=3).fit_transform(data["city"]) + +# %% +# If your data includes a lot of text, you may want to use the +# |TextEncoder|, +# which uses pre-trained language models retrieved from the HuggingFace hub to +# create meaningful text embeddings. +# See :ref:`user_guide_encoders_index` for more details on all the categorical encoders +# provided by skrub, and :ref:`sphx_glr_auto_examples_0010_encodings.py` for a +# comparison between the different methods. +# + +# %% +# Advanced use cases +# ---------------------- +# If your use case involves more complex data preparation, hyperparameter tuning, +# or model selection, if you want to build a multi-table pipeline that requires +# assembling and preparing multiple tables, or if you want to ensure that the +# data preparation can be reproduced exactly, you can use the skrub Data Ops, +# a powerful framework that provides tools to build complex data processing pipelines. +# See the related :ref:`user guide ` and the +# :ref:`data_ops_examples_ref` +# examples for more details. + +# %% +# Next steps +# ---------- +# +# We have briefly covered pipeline creation, vectorizing, assembling, and encoding +# data. We presented the main functionalities of skrub, but there is much +# more to explore! +# +# Please refer to our :ref:`user_guide` for a more in-depth presentation of +# skrub's concepts, or visit our +# `examples `_ for more +# illustrations of the tools that we provide! 
+# diff --git a/skrub/_docs/tutorials/1110_data_ops_intro.py b/skrub/_docs/tutorials/1110_data_ops_intro.py new file mode 100644 index 000000000..4f3fd807e --- /dev/null +++ b/skrub/_docs/tutorials/1110_data_ops_intro.py @@ -0,0 +1,210 @@ +""" +Tutorial: Using Data Ops to build a machine-learning pipeline +======================================================================= + +.. currentmodule:: skrub + +.. |fetch_employee_salaries| replace:: :func:`datasets.fetch_employee_salaries` +.. |TableReport| replace:: :class:`TableReport` +.. |var| replace:: :func:`var` +.. |skb.mark_as_X| replace:: :meth:`DataOp.skb.mark_as_X` +.. |skb.mark_as_y| replace:: :meth:`DataOp.skb.mark_as_y` +.. |TableVectorizer| replace:: :class:`TableVectorizer` +.. |ToDatetime| replace:: :class:`ToDatetime` +.. |skb.apply| replace:: :meth:`.skb.apply() ` +.. |HistGradientBoostingRegressor| replace:: + :class:`~sklearn.ensemble.HistGradientBoostingRegressor` +.. |.skb.full_report()| replace:: :meth:`.skb.full_report() ` +.. |choose_float| replace:: :func:`choose_float` +.. |make_randomized_search| replace:: + :meth:`.skb.make_randomized_search ` + +This example shows how we can use skrub's +:ref:`DataOps ` for building a machine learning pipeline. + +The challenge of preparing data for machine learning is the need to +apply the same data preparation and wrangling operations to new data, for prediction. + +Skrub's DataOps build pipelines that blend data wrangling and machine +learning by recording all the operations involved in pre-processing data +and training models, as well as the state of the transformers and models used to +make predictions. + +.. admonition:: What is a state? + :collapsible: closed + + The state of a transformer or model refers to the internal parameters and + attributes that are learned or set during the fitting process. 
For example, + in a :class:`~sklearn.preprocessing.StandardScaler`, the state would include + the mean and standard deviation calculated from the training data. + In a pre-processing transformer like |ToDatetime|, the state would include the + inferred datetime format based on the data it was fitted on. + In a machine learning model like |HistGradientBoostingRegressor|, the state + would include the fitted parameters of the model after training on the data. + +The result of building a DataOps plan is a *learner*, an object with an interface +similar to that of a scikit-learn estimator, but which contains all the steps in the +data preparation and model training process, along with the state of all the +transformers and models: this lets you save the learner, load it back later, +and use it to make predictions on new data. + +This example is meant to be an introduction to skrub DataOps, and as such it +will not cover all the features. Further examples in the gallery +:ref:`data_ops_examples_ref` go into more detail on skrub DataOps +for more complex tasks. + + +""" + +# %% +# The data +# --------- +# +# We begin by loading the employee salaries dataset, which is a regression dataset +# that contains information about employees and their current annual salaries. +# By default, the |fetch_employee_salaries| function returns the training set. +# We will load the test set later, to evaluate our model on unseen data. + +import pandas as pd + +from skrub.datasets import fetch_employee_salaries + +training_data = pd.read_csv( + fetch_employee_salaries(split="train").employee_salaries_path +) + +# %% +# We can take a look at the dataset using the |TableReport|. +# This dataset contains numerical, categorical, and datetime features. The column +# ``current_annual_salary`` is the target variable we want to predict. 
+# + +import skrub + +skrub.TableReport(training_data) +# %% +# Assembling our DataOps plan +# ---------------------------- +# +# Our goal is to predict the ``current_annual_salary`` of employees based on their +# other features. We will use skrub's DataOps to combine both skrub and scikit-learn +# objects into a single DataOps plan, which will allow us to preprocess the data, +# train a model, and tune hyperparameters. +# +# We begin by defining a skrub |var|, which is the entry point for our DataOps plan. + +data_var = skrub.var("data", training_data) + +# %% +# Next, we define the initial features ``X`` and the target variable ``y``. +# We use the |skb.mark_as_X| and |skb.mark_as_y| methods to mark these variables +# in the DataOps plan. This allows skrub to properly split these objects into +# training and validation steps when executing cross-validation or hyperparameter +# tuning. + +X = data_var.drop("current_annual_salary", axis=1).skb.mark_as_X() +y = data_var["current_annual_salary"].skb.mark_as_y() +# %% +# Our first step is to vectorize the features in ``X``. We will use the +# |TableVectorizer| to convert the categorical and numerical features into a +# numerical format that can be used by machine learning algorithms. +# We apply the vectorizer to ``X`` using the |skb.apply| method, which allows us to +# apply any scikit-learn compatible transformer to the skrub variable. + +from skrub import TableVectorizer + +vectorizer = TableVectorizer() + +X_vec = X.skb.apply(vectorizer) +X_vec +# %% +# By clicking on ``Show graph``, we can see the DataOps plan that has been created: +# the plan shows the steps that have been applied to the data so far. +# Now that we have the vectorized features, we can proceed to train a model. +# We use a scikit-learn |HistGradientBoostingRegressor| to predict the target variable. +# We apply the model to the vectorized features using ``.skb.apply``, and pass +# ``y`` as the target variable. 
+# Note that the resulting ``predictor`` variable shows prediction results on the +# preview subsample, but the model will be properly fitted when we create the learner. + +from sklearn.ensemble import HistGradientBoostingRegressor + +hgb = HistGradientBoostingRegressor() + +predictor = X_vec.skb.apply(hgb, y=y) +predictor + +# %% +# Now that we have built our entire plan, we can explore it in more detail +# with the |.skb.full_report()| method:: +# +# predictor.skb.full_report() +# +# This produces a folder on disk rather than displaying inline in a notebook so +# we do not run it here. But you can +# `see the output here <../../_static/employee_salaries_report/index.html>`_. +# +# This method evaluates each step in +# the plan and shows detailed information about the operations that are being performed. + +# %% +# Turning the DataOps plan into a learner, for later reuse +# --------------------------------------------------------- +# +# Now that we have defined the predictor, we can create a ``learner``, a +# standalone object that contains all the steps in the DataOps plan. We fit the +# learner, so that it can be used to make predictions on new data. + +trained_learner = predictor.skb.make_learner(fitted=True) + +# %% +# A big advantage of the learner is that it can be pickled and saved to disk, +# allowing us to reuse the trained model later without needing to retrain it. +# The learner contains all steps in the DataOps plan, including the fitted +# vectorizer and the trained model. We can save it using Python's ``pickle`` module. +# Here we use ``pickle.dumps`` to serialize the learner object into a byte string. + +import pickle + +saved_model = pickle.dumps(trained_learner) + +# %% +# We can now load the saved model back into memory using ``pickle.loads``. +loaded_model = pickle.loads(saved_model) + +# %% +# Now, we can make predictions on new data using the loaded model, by passing +# a dictionary with the skrub variable names as keys. 
+# We don't have to create a new variable, as this will be done internally by the +# learner. +# In fact, the ``learner`` is similar to a scikit-learn estimator, but rather +# than taking ``X`` and ``y`` as inputs, it takes a dictionary (the "environment") +# where each key corresponds to the name of a skrub variable in the plan (in this +# case, "data"). +# +# We can now get the test set of the employee salaries dataset: +unseen_data = pd.read_csv(fetch_employee_salaries(split="test").employee_salaries_path) + +# %% +# Then, we can use the loaded model to make predictions on the unseen data by +# passing a dictionary with the variable name as the key. + +predicted_values = loaded_model.predict({"data": unseen_data}) +predicted_values + +# %% +# We can also evaluate the model's performance using the ``score`` method, which +# uses the scikit-learn scoring function used by the predictor: +loaded_model.score({"data": unseen_data}) + +# %% +# Conclusion +# ---------- +# +# In this example, we have briefly introduced the skrub DataOps and how they can +# be used to build powerful machine learning pipelines. We have shown how to preprocess +# data and train a model. We have also demonstrated how to save and load the trained +# model, and how to make predictions on new data. +# +# However, skrub DataOps are significantly more powerful than what we have shown here. +# For more advanced examples, see :ref:`data_ops_examples_ref`. diff --git a/skrub/_docs/tutorials/GALLERY_HEADER.txt b/skrub/_docs/tutorials/GALLERY_HEADER.txt new file mode 100644 index 000000000..1e107f52e --- /dev/null +++ b/skrub/_docs/tutorials/GALLERY_HEADER.txt @@ -0,0 +1 @@ +examples diff --git a/skrub/_docs/vision.rst b/skrub/_docs/vision.rst new file mode 100644 index 000000000..37fde54d0 --- /dev/null +++ b/skrub/_docs/vision.rst @@ -0,0 +1,64 @@ +=============================== +Vision: Where is skrub heading? +=============================== + +.. 
currentmodule:: skrub + +Vision Statement +================ + +The goal of skrub is to facilitate machine learning on tables: +`pandas `__ +and `polars `__ dataframes, SQL databases, and more. + +| + +Skrub is high-level, with a philosophy and API matching that of +`scikit-learn `_. It strives to bridge the world +of databases and machine learning, **enabling imperfect assembly and +representation of data when it is noisy**, using the downstream +target to guide assembly when possible (supervised learning for +data assembly). + +In the long term, as skrub is built on higher-level APIs, it will make it +easier for data scientists to use efficient database patterns and +backends. + +Skrub seeks tradeoffs in terms of flexibility: its high-level APIs are by +construction restrictive compared to directly manipulating dataframes. +This is by design, as skrub does not aim to replace tools such as `Pandas +`__, `Ibis `__, or +`DuckDB `_. + +To make things simpler, skrub uses defaults that are chosen empirically to +give good machine learning, even though these are sometimes heuristic, as +in the :class:`TableVectorizer`. We keep the benchmarks used to choose the defaults +in a separate `repository `__. + +Roadmap +======= + +In an open-source project, roadmaps can be wishful thinking: things +happen in an iterative way, often guided by the community. + +We however decided to communicate on what we would like to do in the next +6 months to give a better idea of the vision. + +From shorter term to longer term: + +- Better support for time series + +- Data namespaces, lazy data loading, out of core computing using + database engines (e.g., duckdb) + +- Join discovery to work in data lakes where the tables are not in a + clean relational database + +- Automatic feature synthesis in databases, building on the assembling + features + + +Imputation is out of skrub's scope: scikit-learn implements transformers +that perform imputation. 
Academic work has also shown that imputation is +expensive and often does not improve prediction results +(https://arxiv.org/pdf/2407.19804). diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 3a096a385..47a5b3553 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -590,9 +590,11 @@ class TableVectorizer(TransformerMixin, SkrubBaseEstimator): specified transformer. This disables any preprocessing usually done by the TableVectorizer; the columns are passed to the transformer without any modification. A column is not allowed to appear twice in - ``specific_transformers``. Using ``specific_transformers`` provides - similar functionality to what is offered by scikit-learn's - :class:`~sklearn.compose.ColumnTransformer`. + ``specific_transformers``. + Consider wrapping the ``TableVectorizer`` in :class:`~skrub.ApplyToCols` + to select or exclude specific columns from the processing. Alternatively, + the :ref:`skrub Data Ops ` allow for more complex + pre-processing. drop_null_fraction : float or None, default=1.0 Fraction of null above which the column is dropped. If `drop_null_fraction` is diff --git a/skrub/conftest.py b/skrub/conftest.py index 4bbfed789..3e5f7b661 100644 --- a/skrub/conftest.py +++ b/skrub/conftest.py @@ -34,6 +34,11 @@ def _example_data_dict(): } +# this is needed to ignore the skrub/_docs folder when running pytest +# otherwise, the examples in the folder are executed any time test discovery is run +collect_ignore_glob = ["_docs/**/*.py"] + + _DATAFRAME_MODULES_INFO = {} _DATAFRAME_MODULES_INFO["pandas-numpy-dtypes"] = SimpleNamespace( **{