diff --git a/.github/workflows/build-docker-image.yaml b/.github/workflows/build-docker-image.yaml
index 8c1efcc..ba01b2a 100644
--- a/.github/workflows/build-docker-image.yaml
+++ b/.github/workflows/build-docker-image.yaml
@@ -2,19 +2,26 @@ name: Build Docker Image
on:
workflow_dispatch:
+
+ # Trigger on tag push
push:
tags:
- - 'v*'
+ - 'v*' # Matches tags starting with 'v', like 'v1.0', 'v2.1.3', etc.
+
jobs:
+ # Build and push Docker image
docker:
runs-on: ubuntu-22.04
+
steps:
+ # Checkout repository
- name: Checkout
uses: actions/checkout@v6
with:
submodules: recursive
+ # Set up QEMU and Docker Buildx
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
@@ -42,7 +49,10 @@ jobs:
with:
context: .
file: docker/simple/Dockerfile
+
+ # Currently only building for linux/amd64, due to ARM issues with some dependencies
platforms: linux/amd64
+
push: true
tags: |
ghcr.io/thebigeye/llama-cpp-python:latest
diff --git a/.github/workflows/build-release-publish.yaml b/.github/workflows/build-release-publish.yaml
index 2a31aff..db3e1c3 100644
--- a/.github/workflows/build-release-publish.yaml
+++ b/.github/workflows/build-release-publish.yaml
@@ -1,7 +1,12 @@
-# Links for {package_name}
-
name: Build and Publish Wheels
+# NOTE: This was a headache to set up GitHub Actions for building and publishing wheels.
+# - This workflow builds wheels for CPU and CUDA (12.1 to 12.4) on multiple Python versions.
+# - It creates GitHub Releases for each build and uploads the wheels there.
+# - It generates a PEP 503-compliant simple index for installing via pip from GitHub Pages.
+# - The workflow is triggered on new tags (v*), manual dispatch, and weekly schedule.
+# This could break on any change to the repo structure or the GitHub Actions environment. (I need to make this more robust eventually.)
+
on:
workflow_dispatch:
push:
@@ -9,15 +14,18 @@ on:
schedule:
- cron: '0 0 * * 0'
+# Permissions needed for releases and GitHub Pages
permissions:
contents: write
pages: write
id-token: write
+# Limit concurrency: don't run multiple builds at once
concurrency:
group: "pages"
cancel-in-progress: false
+
jobs:
get_version:
name: Get Version
@@ -29,6 +37,8 @@ jobs:
- name: Extract version
id: set-version
shell: pwsh
+
+ # extract __version__ from llama_cpp/__init__.py
run: |
$version = Select-String -Path "llama_cpp/__init__.py" -Pattern '__version__ = "([^"]+)"' | % { $_.Matches.Groups[1].Value }
Write-Output "version=$version" >> $env:GITHUB_OUTPUT
@@ -155,7 +165,7 @@ jobs:
- name: Install dependencies (Windows)
if: runner.os == 'Windows'
env:
- RUST_LOG: trace
+ RUST_LOG: trace
run: |
python -m pip install --upgrade pip
python -m pip install uv
@@ -178,6 +188,12 @@ jobs:
if-no-files-found: warn
overwrite: true
+ # Collect and Release Jobs
+ # Here we collect the built wheels and create a GitHub Release for each configuration.
+ # If a release with the same tag exists, we delete it first to avoid conflicts,
+ # then create a new release and upload the wheels, overwriting existing assets if any.
+ # This ensures that the latest wheels are always available under the same release tags.
+
collect_cpu:
name: Collect and Release CPU Wheels
needs: [get_version, build_wheels_cpu]
@@ -209,7 +225,7 @@ jobs:
tag="v${{ needs.get_version.outputs.version }}"
git tag -f "$tag"
git push -f origin "$tag"
- gh release create "$tag" --title "CPU wheels v${{ needs.get_version.outputs.version }}" --notes "Automated build"
+ gh release create "$tag" --title "v${{ needs.get_version.outputs.version }}" --notes "Automated build"
- name: Upload wheels
run: gh release upload "v${{ needs.get_version.outputs.version }}" dist/*.whl --clobber
@@ -243,19 +259,23 @@ jobs:
tag="v${{ needs.get_version.outputs.version }}-${{ matrix.short }}"
git tag -f "$tag"
git push -f origin "$tag"
- gh release create "$tag" --title "CUDA ${{ matrix.short }} wheels v${{ needs.get_version.outputs.version }}" --notes "Automated build"
+ gh release create "$tag" --title "v${{ needs.get_version.outputs.version }}-${{ matrix.short }}" --notes "Automated build"
- name: Upload wheels
run: gh release upload "v${{ needs.get_version.outputs.version }}-${{ matrix.short }}" dist/*.whl --clobber
+ # Generate PEP 503-compliant simple index for pip installation from GitHub Pages
+ # See: https://peps.python.org/pep-0503/
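+ # Once deployed, wheels can be installed via pip's extra index. The command below is a sketch:
+ # it assumes the default GitHub Pages URL for this repository; adjust to the actual deployment.
+ #   pip install llama-cpp-python --extra-index-url https://thebigeye.github.io/llama-cpp-python/whl/cu121/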
generate_pypi_index:
name: Generate PyPI Index
needs: [get_version, collect_cpu, collect_cuda]
runs-on: ubuntu-22.04
env:
GITHUB_TOKEN: ${{ github.token }}
+
steps:
- name: Create base directory
+ # TODO: make this less hardcoded
run: mkdir -p dist/whl/{cpu,cu121,cu122,cu123,cu124}
- name: Generate PEP 503 indices using GitHub API
@@ -264,6 +284,10 @@ jobs:
REPO: ${{ github.repository }}
REPO_OWNER: ${{ github.repository_owner }}
REPO_NAME: ${{ github.event.repository.name }}
+
+ # Due to eventual-consistency delays in the GitHub Releases API, we retry here
+ # so that the action waits for the releases to become available and doesn't miss assets or wheels.
+ # Yes, I could use an external script, but this is easier to manage in one file for now.
run: |
python - << 'PYEOF'
import os
@@ -278,6 +302,8 @@ jobs:
package_name = "llama-cpp-python"
+ # Define configurations and their corresponding release tags
+ # for example: 'cu121' -> 'v{version}-cu121'
configs = {
'cpu': {'tag': f'v{version}', 'label': 'CPU'},
'cu121': {'tag': f'v{version}-cu121', 'label': 'CUDA 12.1'},
@@ -286,6 +312,7 @@ jobs:
'cu124': {'tag': f'v{version}-cu124', 'label': 'CUDA 12.4'},
}
+ # GitHub API headers
headers = {
'Authorization': f'token {token}',
'Accept': 'application/vnd.github.v3+json'
@@ -293,10 +320,12 @@ jobs:
available_configs = []
+ # Process each configuration
+ # For each, fetch the release info and generate the simple index files
for config, info in configs.items():
tag = info['tag']
url = f'https://api.github.com/repos/{repo}/releases/tags/{tag}'
-
+
# Retry up to 5 times (releases can take seconds to propagate)
for attempt in range(5):
response = requests.get(url, headers=headers)
@@ -347,7 +376,7 @@ jobs:
'
',
' đĻ llama-cpp-python Wheels
',
' Pre-compiled wheels with CPU and CUDA support (hosted on GitHub Releases).
',
- ' Note: pip may warn about no hash verification — safe as downloads are direct from GitHub.
',
+ ' Note: pip may warn about no hash verification - safe as downloads are direct from GitHub.
',
]
for config, label in available_configs:
@@ -374,15 +403,17 @@ jobs:
print("✅ Generated root index")
PYEOF
+ # Create README for the GitHub Pages site
+ # TODO: make it more beautiful later
- name: Create README
run: |
cat > dist/README.md << 'EOF'
# đĻ llama-cpp-python Wheels
-
+
Pre-compiled wheels with CPU and CUDA support.
-
+
See main page for installation commands.
-
+
**Repository**: https://github.com/${{ github.repository }}
EOF
diff --git a/.github/workflows/build-wheel-cpu.yml b/.github/workflows/build-wheel-cpu.yml
index e8ecab7..281515b 100644
--- a/.github/workflows/build-wheel-cpu.yml
+++ b/.github/workflows/build-wheel-cpu.yml
@@ -14,40 +14,26 @@ jobs:
- uses: actions/checkout@v6
with:
submodules: recursive
-
- uses: actions/setup-python@v6
with:
python-version: "3.11"
-
- name: Install cibuildwheel
run: |
python -m pip install -U pip
python -m pip install -U cibuildwheel wheel
-
- - name: Build wheel (CPU only, skip repair)
+ - name: Build wheel (CPU only)
env:
- CIBW_BUILD: "cp310-manylinux_x86_64"
+ # Build wheels for Python 3.9 to 3.14 on manylinux_x86_64
+ CIBW_BUILD: "cp39-manylinux_x86_64 cp310-manylinux_x86_64 cp311-manylinux_x86_64 cp312-manylinux_x86_64 cp313-manylinux_x86_64 cp314-manylinux_x86_64"
CIBW_SKIP: "*-musllinux_*"
CIBW_BUILD_VERBOSITY: 1
- CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
+ # Use newer manylinux_2_28 image for better compatibility with recent Python versions (3.12+) and modern glibc
+ CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28
CIBW_REPAIR_WHEEL_COMMAND_LINUX: ""
CIBW_ENVIRONMENT_LINUX: >-
CMAKE_ARGS="-DGGML_CUDA=OFF -DGGML_METAL=OFF -DGGML_VULKAN=OFF -DGGML_BLAS=OFF -DGGML_NATIVE=OFF"
run: |
- python -m cibuildwheel --output-dir wheelhouse_temp
-
- - name: Rename wheel to manylinux
- run: |
- mkdir -p wheelhouse
- cd wheelhouse_temp
- for wheel in *.whl; do
- new_name=$(echo "$wheel" | sed 's/linux_x86_64/manylinux_2_17_x86_64.manylinux2014_x86_64/')
- mv "$wheel" "../wheelhouse/$new_name"
- echo "Created: $new_name"
- done
- cd ..
- ls -lh wheelhouse/
-
+ python -m cibuildwheel --output-dir wheelhouse
- uses: actions/upload-artifact@v6
with:
name: wheelhouse-linux
@@ -61,29 +47,25 @@ jobs:
- uses: actions/checkout@v6
with:
submodules: recursive
-
- uses: actions/setup-python@v6
with:
python-version: "3.11"
-
- name: Install cibuildwheel
run: |
python -m pip install -U pip
python -m pip install -U cibuildwheel wheel
-
- name: Build wheel (CPU only)
env:
- CIBW_BUILD: "cp310-win_amd64"
+ # Build wheels for Python 3.9 to 3.14 on win_amd64
+ CIBW_BUILD: "cp39-win_amd64 cp310-win_amd64 cp311-win_amd64 cp312-win_amd64 cp313-win_amd64 cp314-win_amd64"
CIBW_BUILD_VERBOSITY: 1
CIBW_ENVIRONMENT_WINDOWS: >-
CMAKE_ARGS="-DGGML_CUDA=OFF -DGGML_METAL=OFF -DGGML_VULKAN=OFF -DGGML_BLAS=OFF -DGGML_NATIVE=OFF"
run: |
python -m cibuildwheel --output-dir wheelhouse
-
- name: List wheels
run: |
Get-ChildItem -Path wheelhouse -Recurse
-
- uses: actions/upload-artifact@v6
with:
name: wheelhouse-windows
@@ -100,10 +82,8 @@ jobs:
with:
path: wheelhouse
merge-multiple: true
-
- name: List all wheels
run: ls -lh wheelhouse/
-
- name: Upload to Release
uses: softprops/action-gh-release@v2
with:
diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml
index f98bfde..26922f6 100644
--- a/.github/workflows/build-wheels-cuda.yaml
+++ b/.github/workflows/build-wheels-cuda.yaml
@@ -49,7 +49,7 @@ jobs:
with:
vs-version: '[16.11,16.12)'
- - uses: actions/checkout@v6
+ - uses: actions/checkout@v4
with:
submodules: "recursive"
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index b13e030..d66737a 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -6,6 +6,7 @@ on:
push:
branches:
- main
+ - dev
env:
REPO_ID: Qwen/Qwen2-0.5B-Instruct-GGUF
@@ -39,12 +40,12 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.9", "3.10", "3.11", "3.12"]
+ python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
steps:
- uses: actions/checkout@v6
with:
submodules: "recursive"
-
+
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v6
with:
@@ -70,12 +71,12 @@ jobs:
runs-on: windows-latest
strategy:
matrix:
- python-version: ["3.9", "3.10", "3.11", "3.12"]
+ python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
steps:
- uses: actions/checkout@v6
with:
submodules: "recursive"
-
+
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v6
with:
@@ -94,7 +95,7 @@ jobs:
python -m pip install uv
python -m uv pip install -e .[all] --verbose
shell: cmd
-
+
- name: Test with pytest
run: |
python -m pytest
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4b06d98..0a53c03 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,7 +53,7 @@ if (LLAMA_BUILD)
# When building, don't use the install RPATH already
# (but later on when installing)
set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)
-
+
# Add the automatically determined parts of the RPATH
# which point to directories outside the build tree to the install RPATH
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
@@ -153,6 +153,15 @@ if (LLAMA_BUILD)
add_compile_definitions(GGML_USE_METAL)
endif()
+ # Set version for mtmd (required by upstream CMakeLists.txt)
+ # NOTE: This is a workaround for mtmd build requirements.
+ # Version is set to 0.0.0 for local builds. If upstream adds version
+ # compatibility checks, this may need to match llama.cpp version.
+ if (NOT DEFINED LLAMA_BUILD_NUMBER)
+ set(LLAMA_BUILD_NUMBER 0)
+ endif()
+ set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
+
# Building llava
add_subdirectory(vendor/llama.cpp/tools/mtmd)
@@ -185,3 +194,4 @@ if (LLAMA_BUILD)
# target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
endif()
endif()
+
diff --git a/README.md b/README.md
index 3e4ab9f..04d6af0 100644
--- a/README.md
+++ b/README.md
@@ -1,26 +1,29 @@
-
+
# Python Bindings for [`llama.cpp`](https://github.com/ggerganov/llama.cpp)
[](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest)
-[](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml)
-[](https://pypi.org/project/llama-cpp-python/)
-[](https://pypi.org/project/llama-cpp-python/)
-[](https://pypi.org/project/llama-cpp-python/)
-[](https://pepy.tech/projects/llama-cpp-python)
-[]()
+[](https://github.com/TheBigEye/llama-cpp-python/actions/workflows/test.yaml)
+[]()
+
+
+
+
+
Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library.
-> **Fork notice**
+> [!NOTE]
>
-> This repository is a **fork** of the original [`abetlen/llama-cpp-python`](https://github.com/abetlen/llama-cpp-python) project. The original repository has been largely inactive and unmaintained for some time, so this fork was created to follow a different path: **dedicated CPU and CUDA only support (no METAL, no MacOS)** for better stability and simplicity.
+> This repository is a **fork** of the original **[`abetlen/llama-cpp-python`](https://github.com/abetlen/llama-cpp-python)** project. The original repository has been largely inactive and unmaintained for some time,
+> and one of my most important projects depends on it, so this fork was created to keep the original maintained and to follow a different path: **dedicated CPU and CUDA only support (no METAL, no MacOS)** for better stability and simplicity.
>
-> If you require METAL / ARM support, please refer to the original repository: [https://github.com/abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) — this fork intentionally focuses on CPU and CUDA only usage.
+> If you require METAL / ARM support, please refer to the original repository: **[https://github.com/abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)** — this fork intentionally focuses on CPU and CUDA only usage.
-> NOTE: This readme is still under development, so all documentation still refers to the original repository and module.
+> [!NOTE]
+> This readme is still under development, so all documentation still refers to the original repository and module.
This package provides:
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index c1dde70..0d56603 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *
-__version__ = "0.3.16"
+__version__ = "0.4.0"
diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py
index e88ed38..33f1cb5 100644
--- a/llama_cpp/_ctypes_extensions.py
+++ b/llama_cpp/_ctypes_extensions.py
@@ -54,6 +54,7 @@ def load_shared_library(lib_base_name: str, base_path: pathlib.Path):
os.add_dll_directory(str(base_path))
if "CUDA_PATH" in os.environ:
os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
+ os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin", "x64"))
os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
if "HIP_PATH" in os.environ:
os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin"))
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index b5175a7..bc36464 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -75,11 +75,7 @@ def free_model():
self._exit_stack.callback(free_model)
def close(self):
- if self.sampler is not None:
- # NOTE: Must remove custom samplers before free or llama.cpp will try to free them
- for i, _ in reversed(self.custom_samplers):
- llama_cpp.llama_sampler_chain_remove(self.sampler, i)
- self.custom_samplers.clear()
+ # NOTE: LlamaModel doesn't manage samplers (that's LlamaSampler's job)
self._exit_stack.close()
def __del__(self):
@@ -292,19 +288,26 @@ def kv_cache_clear(self):
def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
assert self.memory is not None, "Memory is not initialized"
- seq_id = seq_id if seq_id >= 0 else 0
+ # seq_id < 0 means "all sequences" - this is valid per llama.cpp docs
llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1)
def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
assert self.memory is not None, "Memory is not initialized"
+ # Negative seq_id not documented for cp - require non-negative IDs
+ assert seq_id_src >= 0, f"seq_id_src must be >= 0, got {seq_id_src}"
+ assert seq_id_dst >= 0, f"seq_id_dst must be >= 0, got {seq_id_dst}"
llama_cpp.llama_memory_seq_cp(self.memory, seq_id_src, seq_id_dst, p0, p1)
def kv_cache_seq_keep(self, seq_id: int):
assert self.memory is not None, "Memory is not initialized"
+ # Negative seq_id not documented for keep - require non-negative ID
+ assert seq_id >= 0, f"seq_id must be >= 0, got {seq_id}"
llama_cpp.llama_memory_seq_keep(self.memory, seq_id)
def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
assert self.memory is not None, "Memory is not initialized"
+ # Negative seq_id not documented for shift - require non-negative ID
+ assert seq_id >= 0, f"seq_id must be >= 0, got {seq_id}"
llama_cpp.llama_memory_seq_add(self.memory, seq_id, p0, p1, shift)
def get_state_size(self) -> int:
@@ -355,7 +358,9 @@ def get_embeddings_seq(self, seq_id: int):
# Sampling functions - deprecated, use LlamaSampler instead
def set_rng_seed(self, seed: int):
- raise NotImplementedError("set_rng_seed is deprecated, use LlamaSampler instead")
+ raise NotImplementedError(
+ "set_rng_seed is deprecated, use LlamaSampler instead"
+ )
def sample_repetition_penalties(
self,
@@ -366,30 +371,44 @@ def sample_repetition_penalties(
penalty_freq: float,
penalty_present: float,
):
- raise NotImplementedError("sample_repetition_penalties is deprecated, use LlamaSampler instead")
+ raise NotImplementedError(
+ "sample_repetition_penalties is deprecated, use LlamaSampler instead"
+ )
def sample_softmax(self, candidates: "_LlamaTokenDataArray"):
- raise NotImplementedError("sample_softmax is deprecated, use LlamaSampler instead")
+ raise NotImplementedError(
+ "sample_softmax is deprecated, use LlamaSampler instead"
+ )
def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int):
- raise NotImplementedError("sample_top_k is deprecated, use LlamaSampler instead")
+ raise NotImplementedError(
+ "sample_top_k is deprecated, use LlamaSampler instead"
+ )
def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
- raise NotImplementedError("sample_top_p is deprecated, use LlamaSampler instead")
+ raise NotImplementedError(
+ "sample_top_p is deprecated, use LlamaSampler instead"
+ )
def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
- raise NotImplementedError("sample_min_p is deprecated, use LlamaSampler instead")
+ raise NotImplementedError(
+ "sample_min_p is deprecated, use LlamaSampler instead"
+ )
def sample_typical(
self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int
):
- raise NotImplementedError("sample_typical is deprecated, use LlamaSampler instead")
+ raise NotImplementedError(
+ "sample_typical is deprecated, use LlamaSampler instead"
+ )
def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float):
raise NotImplementedError("sample_temp is deprecated, use LlamaSampler instead")
def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar):
- raise NotImplementedError("sample_grammar is deprecated, use LlamaSampler instead")
+ raise NotImplementedError(
+ "sample_grammar is deprecated, use LlamaSampler instead"
+ )
def sample_token_mirostat(
self,
@@ -399,7 +418,9 @@ def sample_token_mirostat(
m: int,
mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
) -> int:
- raise NotImplementedError("sample_token_mirostat is deprecated, use LlamaSampler instead")
+ raise NotImplementedError(
+ "sample_token_mirostat is deprecated, use LlamaSampler instead"
+ )
def sample_token_mirostat_v2(
self,
@@ -408,17 +429,25 @@ def sample_token_mirostat_v2(
eta: float,
mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
) -> int:
- raise NotImplementedError("sample_token_mirostat_v2 is deprecated, use LlamaSampler instead")
+ raise NotImplementedError(
+ "sample_token_mirostat_v2 is deprecated, use LlamaSampler instead"
+ )
def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int:
- raise NotImplementedError("sample_token_greedy is deprecated, use LlamaSampler instead")
+ raise NotImplementedError(
+ "sample_token_greedy is deprecated, use LlamaSampler instead"
+ )
def sample_token(self, candidates: "_LlamaTokenDataArray") -> int:
- raise NotImplementedError("sample_token is deprecated, use LlamaSampler instead")
+ raise NotImplementedError(
+ "sample_token is deprecated, use LlamaSampler instead"
+ )
# Grammar
def grammar_accept_token(self, grammar: LlamaGrammar, token: int):
- raise NotImplementedError("grammar_accept_token is deprecated, use LlamaSampler instead")
+ raise NotImplementedError(
+ "grammar_accept_token is deprecated, use LlamaSampler instead"
+ )
def reset_timings(self):
llama_cpp.llama_perf_context_reset(self.ctx)
@@ -493,7 +522,7 @@ def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool):
self.batch.seq_id[j][0] = seq_id
self.batch.n_seq_id[j] = 1
self.batch.logits[j] = logits_all
- self.batch.logits[n_tokens - 1] = True
+ self.batch.logits[n_tokens0 + n_tokens - 1] = True
class LlamaTokenDataArray:
@@ -602,16 +631,16 @@ def sample(
logits_array: Optional[npt.NDArray[np.single]] = None,
):
# This method is deprecated in favor of using LlamaSampler directly
- raise NotImplementedError("LlamaSamplingContext.sample is deprecated, use LlamaSampler instead")
+ raise NotImplementedError(
+ "LlamaSamplingContext.sample is deprecated, use LlamaSampler instead"
+ )
def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool):
self.prev.append(id)
class CustomSampler:
- def __init__(
- self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]
- ):
+ def __init__(self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]):
self.apply_func = apply_func
def apply_wrapper(
@@ -646,6 +675,7 @@ def __init__(self):
params = llama_cpp.llama_sampler_chain_default_params()
self.sampler = llama_cpp.llama_sampler_chain_init(params)
self.custom_samplers: List[Tuple[int, CustomSampler]] = []
+ self._pinned_buffers: List[ctypes.Array] = [] # Pin C arrays to prevent GC
self._exit_stack = ExitStack()
def free_sampler():
@@ -723,20 +753,24 @@ def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar):
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
def add_grammar_lazy_patterns(
- self,
- model: LlamaModel,
+ self,
+ model: LlamaModel,
grammar: LlamaGrammar,
trigger_patterns: List[str],
- trigger_tokens: List[int]
+ trigger_tokens: List[int],
):
# Convert patterns to C array
pattern_ptrs = (ctypes.c_char_p * len(trigger_patterns))()
for i, pattern in enumerate(trigger_patterns):
pattern_ptrs[i] = pattern.encode("utf-8")
-
+
# Convert tokens to C array
token_array = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens)
-
+
+ # Pin buffers to prevent garbage collection while C code may reference them
+ self._pinned_buffers.append(pattern_ptrs)
+ self._pinned_buffers.append(token_array)
+
sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns(
model.vocab,
grammar._grammar.encode("utf-8"),
@@ -744,7 +778,7 @@ def add_grammar_lazy_patterns(
pattern_ptrs,
len(trigger_patterns),
token_array,
- len(trigger_tokens)
+ len(trigger_tokens),
)
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
@@ -771,13 +805,16 @@ def add_dry(
dry_base: float,
dry_allowed_length: int,
dry_penalty_last_n: int,
- seq_breakers: List[str]
+ seq_breakers: List[str],
):
# Convert seq_breakers to C array
breaker_ptrs = (ctypes.c_char_p * len(seq_breakers))()
for i, breaker in enumerate(seq_breakers):
breaker_ptrs[i] = breaker.encode("utf-8")
-
+
+ # Pin buffer to prevent garbage collection
+ self._pinned_buffers.append(breaker_ptrs)
+
sampler = llama_cpp.llama_sampler_init_dry(
model.vocab,
n_ctx_train,
@@ -786,25 +823,22 @@ def add_dry(
dry_allowed_length,
dry_penalty_last_n,
breaker_ptrs,
- len(seq_breakers)
+ len(seq_breakers),
)
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
- def add_logit_bias(
- self,
- n_vocab: int,
- logit_bias: Dict[int, float]
- ):
+ def add_logit_bias(self, n_vocab: int, logit_bias: Dict[int, float]):
# Convert logit_bias dict to C array
bias_array = (llama_cpp.llama_logit_bias * len(logit_bias))()
for i, (token, bias) in enumerate(logit_bias.items()):
bias_array[i].token = token
bias_array[i].bias = bias
-
+
+ # Pin buffer to prevent garbage collection
+ self._pinned_buffers.append(bias_array)
+
sampler = llama_cpp.llama_sampler_init_logit_bias(
- n_vocab,
- len(logit_bias),
- bias_array
+ n_vocab, len(logit_bias), bias_array
)
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
@@ -838,15 +872,17 @@ def reset(self):
def clone(self):
# NOTE: Custom samplers cannot be cloned due to Python callback limitations
if self.custom_samplers:
- raise NotImplementedError("Cannot clone LlamaSampler that contains custom samplers")
-
+ raise NotImplementedError(
+ "Cannot clone LlamaSampler that contains custom samplers"
+ )
+
cloned_sampler = llama_cpp.llama_sampler_clone(self.sampler)
# Create a new wrapper around the cloned sampler
new_sampler = LlamaSampler.__new__(LlamaSampler)
new_sampler.sampler = cloned_sampler
new_sampler.custom_samplers = []
new_sampler._exit_stack = ExitStack()
-
+
def free_sampler():
if new_sampler.sampler is not None:
llama_cpp.llama_sampler_free(new_sampler.sampler)
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 71d94eb..b617d10 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -91,9 +91,9 @@ def __init__(
logits_all: bool = False,
embedding: bool = False,
offload_kqv: bool = True,
- flash_attn: bool = False,
op_offload: Optional[bool] = None,
swa_full: Optional[bool] = None,
+ flash_attn: Optional[bool] = None,
# Sampling Params
no_perf: bool = False,
last_n_tokens_size: int = 64,
@@ -173,7 +173,7 @@ def __init__(
logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
embedding: Embedding mode only.
offload_kqv: Offload K, Q, V to GPU.
- flash_attn: Use flash attention.
+ flash_attn: Use flash attention. None = auto, True = enabled, False = disabled.
op_offload: offload host tensor operations to device
swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
no_perf: Measure performance timings.
@@ -190,6 +190,11 @@ def __init__(
type_v: KV cache data type for V (default: f16)
spm_infill: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
+ Note:
+ Recurrent and hybrid models (Mamba, RWKV, Nemotron-A3B, Jamba) cannot
+ rewind their state and require full reset on history edits. This is handled
+ automatically to maintain compatibility. Standard transformers are unaffected.
+
Raises:
ValueError: If the model path does not exist.
@@ -341,7 +346,16 @@ def __init__(
self._logits_all = logits_all if draft_model is None else True
self.context_params.embeddings = embedding # TODO: Rename to embeddings
self.context_params.offload_kqv = offload_kqv
- self.context_params.flash_attn = flash_attn
+ if flash_attn is None:
+ self.context_params.flash_attn_type = llama_cpp.LLAMA_FLASH_ATTN_TYPE_AUTO
+ elif flash_attn:
+ self.context_params.flash_attn_type = (
+ llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+ )
+ else:
+ self.context_params.flash_attn_type = (
+ llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
+ )
if op_offload is not None:
self.context_params.op_offload = op_offload
@@ -546,6 +560,11 @@ def free_lora_adapter():
self._sampler = None
+ # Cache recurrent/hybrid model detection to avoid repeated FFI calls
+ self._is_recurrent_model = llama_cpp.llama_model_is_recurrent(
+ self._model.model
+ ) or llama_cpp.llama_model_is_hybrid(self._model.model)
+
@property
def ctx(self) -> llama_cpp.llama_context_p:
return self._ctx.ctx
@@ -573,6 +592,19 @@ def eval_logits(self) -> Deque[List[float]]:
maxlen=self._n_ctx if self._logits_all else 1,
)
+ @property
+ def _is_recurrent(self) -> bool:
+ """Check if model is recurrent (SSM) or hybrid (SSM+Attention).
+
+ These models (Mamba, RWKV, Nemotron, Jamba, etc.) cannot rewind their
+ recurrent state without snapshots. Only strict forward progression or
+ full reset is allowed.
+
+ Returns:
+ True if model has recurrent state that cannot be rewound.
+ """
+ return self._is_recurrent_model
+
def tokenize(
self, text: bytes, add_bos: bool = True, special: bool = False
) -> List[int]:
@@ -631,6 +663,11 @@ def reset(self):
"""Reset the model state."""
self.n_tokens = 0
+ if self._is_recurrent:
+ mem = llama_cpp.llama_get_memory(self._ctx.ctx)
+ if mem is not None:
+ llama_cpp.llama_memory_clear(mem, True)
+
def eval(self, tokens: Sequence[int]):
"""Evaluate a list of tokens.
@@ -882,19 +919,29 @@ def generate(
# Check for kv cache prefix match
if reset and self.n_tokens > 0:
longest_prefix = 0
- for a, b in zip(self._input_ids, tokens[:-1]):
+ for a, b in zip(self._input_ids, tokens):
if a == b:
longest_prefix += 1
else:
break
+
+ # Recurrent models cannot rewind state; reset if needed
+ if self._is_recurrent and longest_prefix < self.n_tokens:
+ longest_prefix = 0
+ reset = True
+ if self.verbose:
+ print(
+ "Llama.generate: recurrent model requires full state reset",
+ file=sys.stderr,
+ )
+
if longest_prefix > 0:
reset = False
tokens = tokens[longest_prefix:]
self.n_tokens = longest_prefix
if self.verbose:
print(
- f"Llama.generate: {longest_prefix} prefix-match hit, "
- f"remaining {len(tokens)} prompt tokens to eval",
+ f"Llama.generate: {longest_prefix} prefix-match hit, {len(tokens)} tokens to eval",
file=sys.stderr,
)
@@ -934,7 +981,8 @@ def generate(
sample_idx += 1
if stopping_criteria is not None and stopping_criteria(
- self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :]
+ self._input_ids[:sample_idx],
+ self._scores[sample_idx - self.n_tokens, :],
):
return
tokens_or_none = yield token
@@ -1041,7 +1089,9 @@ def embed(
data: Union[List[List[float]], List[List[List[float]]]] = []
def decode_batch(seq_sizes: List[int]):
- llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+ mem = llama_cpp.llama_get_memory(self._ctx.ctx)
+ if mem is not None:
+ llama_cpp.llama_memory_clear(mem, True)
self._ctx.decode(self._batch)
self._batch.reset()
@@ -1112,7 +1162,9 @@ def decode_batch(seq_sizes: List[int]):
output = data[0] if isinstance(input, str) else data
- llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+ mem = llama_cpp.llama_get_memory(self._ctx.ctx)
+ if mem is not None:
+ llama_cpp.llama_memory_clear(mem, True)
self.reset()
if return_count:
@@ -1157,9 +1209,9 @@ def _create_completion(
bos_token_id: int = self.token_bos()
cls_token_id: int = self._model.token_cls()
sep_token_id: int = self._model.token_sep()
- prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix
- middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix
- suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix
+ prefix_token_id: int = self._model.token_prefix()
+ middle_token_id: int = self._model.token_middle()
+ suffix_token_id: int = self._model.token_suffix()
add_space_prefix: bool = (
self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true"
)
@@ -1315,7 +1367,7 @@ def logit_bias_processor(
if seed is not None:
self.set_seed(seed)
else:
- self.set_seed(random.Random(self._seed).randint(0, 2 ** 32))
+ self.set_seed(random.Random(self._seed).randint(0, 2**32))
finish_reason = "length"
multibyte_fix = 0
@@ -2056,7 +2108,10 @@ def create_chat_completion_openai_v1(
stream = kwargs.get("stream", False) # type: ignore
assert isinstance(stream, bool)
if stream:
- return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore
+ return (
+ ChatCompletionChunk(**chunk)
+ for chunk in self.create_chat_completion(*args, **kwargs)
+ ) # type: ignore
else:
return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore
except ImportError:
@@ -2096,7 +2151,7 @@ def __getstate__(self):
logits_all=self._logits_all,
embedding=self.context_params.embeddings,
offload_kqv=self.context_params.offload_kqv,
- flash_attn=self.context_params.flash_attn,
+ flash_attn=self.context_params.flash_attn_type,
op_offload=self.context_params.op_offload,
swa_full=self.context_params.swa_full,
# Sampling Params
@@ -2316,19 +2371,23 @@ def from_pretrained(
)
if additional_files:
- for additonal_file_name in additional_files:
+ for additional_file_name in additional_files:
# find the additional shard file:
- matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)]
+ matching_additional_files = [
+ file
+ for file in file_list
+ if fnmatch.fnmatch(file, additional_file_name)
+ ]
if len(matching_additional_files) == 0:
raise ValueError(
- f"No file found in {repo_id} that match {additonal_file_name}\n\n"
+ f"No file found in {repo_id} that matches {additional_file_name}\n\n"
f"Available Files:\n{json.dumps(file_list)}"
)
if len(matching_additional_files) > 1:
raise ValueError(
- f"Multiple files found in {repo_id} matching {additonal_file_name}\n\n"
+ f"Multiple files found in {repo_id} matching {additional_file_name}\n\n"
f"Available Files:\n{json.dumps(files)}"
)
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 711d42a..a4f6b0d 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -33,7 +33,11 @@
# Specify the base name of the shared library to load
_lib_base_name = "llama"
_override_base_path = os.environ.get("LLAMA_CPP_LIB_PATH")
-_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _override_base_path is None else pathlib.Path(_override_base_path)
+_base_path = (
+ pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib"
+ if _override_base_path is None
+ else pathlib.Path(_override_base_path)
+)
# Load the library
_lib = load_shared_library(_lib_base_name, _base_path)
@@ -117,6 +121,14 @@
# typedef bool (*ggml_abort_callback)(void * data);
ggml_abort_callback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p)
+# typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
+ggml_log_callback = ctypes.CFUNCTYPE(
+ None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p
+)
+
+# typedef struct ggml_threadpool * ggml_threadpool_t;
+ggml_threadpool_t = ctypes.c_void_p
+
# llama.h bindings
_lib.llama_max_devices.argtypes = []
@@ -177,6 +189,13 @@
# typedef int32_t llama_seq_id;
llama_seq_id = ctypes.c_int32
+# typedef uint32_t llama_state_seq_flags;
+llama_state_seq_flags = ctypes.c_uint32
+
+# State sequence flags
+LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1
+LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY = 2
+
# enum llama_vocab_type {
# LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
@@ -294,6 +313,7 @@
LLAMA_ROPE_TYPE_NORM = 0
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX = 2
LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE = 8
+LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE = 40
LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION = 24
@@ -462,6 +482,14 @@
LLAMA_ATTENTION_TYPE_CAUSAL = 0
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1
+# enum llama_flash_attn_type {
+# LLAMA_FLASH_ATTN_TYPE_AUTO = -1,
+# LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
+# LLAMA_FLASH_ATTN_TYPE_ENABLED = 1,
+# };
+LLAMA_FLASH_ATTN_TYPE_AUTO = -1
+LLAMA_FLASH_ATTN_TYPE_DISABLED = 0
+LLAMA_FLASH_ATTN_TYPE_ENABLED = 1
# enum llama_split_mode {
# LLAMA_SPLIT_MODE_NONE = 0, // single GPU
@@ -472,6 +500,15 @@
LLAMA_SPLIT_MODE_LAYER = 1
LLAMA_SPLIT_MODE_ROW = 2
+# enum llama_params_fit_status {
+# LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0,
+# LLAMA_PARAMS_FIT_STATUS_FAILURE = 1,
+# LLAMA_PARAMS_FIT_STATUS_ERROR = 2,
+# };
+LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0
+LLAMA_PARAMS_FIT_STATUS_FAILURE = 1
+LLAMA_PARAMS_FIT_STATUS_ERROR = 2
+
# typedef struct llama_token_data {
# llama_token id; // token id
@@ -559,6 +596,7 @@ class llama_token_data_array(ctypes.Structure):
# typedef struct llama_batch {
# int32_t n_tokens;
+
# llama_token * token;
# float * embd;
# llama_pos * pos;
@@ -613,6 +651,22 @@ class llama_batch(ctypes.Structure):
LLAMA_KV_OVERRIDE_TYPE_BOOL = 2
LLAMA_KV_OVERRIDE_TYPE_STR = 3
+# enum llama_model_meta_key {
+# LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE,
+# ...
+# };
+LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE = 0
+LLAMA_MODEL_META_KEY_SAMPLING_TOP_K = 1
+LLAMA_MODEL_META_KEY_SAMPLING_TOP_P = 2
+LLAMA_MODEL_META_KEY_SAMPLING_MIN_P = 3
+LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY = 4
+LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD = 5
+LLAMA_MODEL_META_KEY_SAMPLING_TEMP = 6
+LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N = 7
+LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT = 8
+LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT = 9
+LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU = 10
+LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA = 11
# struct llama_model_kv_override {
# enum llama_model_kv_override_type tag;
@@ -688,6 +742,7 @@ class llama_model_kv_override(ctypes.Structure):
# // override key-value pairs of the model meta data
# const struct llama_model_kv_override * kv_overrides;
+
# // Keep the booleans together to avoid misalignment during copy-by-value.
# bool vocab_only; // only load the vocabulary, no weights
# bool use_mmap; // use mmap if possible
@@ -716,7 +771,9 @@ class llama_model_params(ctypes.Structure):
if TYPE_CHECKING:
devices: CtypesArray[ctypes.c_void_p] # NOTE: unused
- tensor_buft_overrides: CtypesArray[llama_model_tensor_buft_override] # NOTE: unused
+ tensor_buft_overrides: CtypesArray[
+ llama_model_tensor_buft_override
+ ] # NOTE: unused
n_gpu_layers: int
split_mode: int
main_gpu: int
@@ -731,8 +788,8 @@ class llama_model_params(ctypes.Structure):
use_extra_bufts: bool
_fields_ = [
- ("devices", ctypes.c_void_p), # NOTE: unnused
- ("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused
+ ("devices", ctypes.c_void_p), # NOTE: unused
+ ("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused
("n_gpu_layers", ctypes.c_int32),
("split_mode", ctypes.c_int),
("main_gpu", ctypes.c_int32),
@@ -745,6 +802,8 @@ class llama_model_params(ctypes.Structure):
("use_mlock", ctypes.c_bool),
("check_tensors", ctypes.c_bool),
("use_extra_bufts", ctypes.c_bool),
+ ("no_host", ctypes.c_bool),
+ ("no_alloc", ctypes.c_bool),
]
@@ -784,6 +843,7 @@ class llama_model_params(ctypes.Structure):
# ggml_abort_callback abort_callback;
# void * abort_callback_data;
+
# // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
# bool embeddings; // if true, extract embeddings (together with logits)
# bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
@@ -826,7 +886,6 @@ class llama_context_params(ctypes.Structure):
abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
embeddings (bool): if true, extract embeddings (together with logits)
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
- flash_attn (bool): whether to use flash attention
no_perf (bool): whether to measure performance timings
op_offload (bool): offload host tensor operations to device
swa_full (bool): use full-size SWA cache
@@ -843,6 +902,7 @@ class llama_context_params(ctypes.Structure):
rope_scaling_type: int
pooling_type: int
attention_type: int
+ flash_attn_type: int
rope_freq_base: float
rope_freq_scale: float
yarn_ext_factor: float
@@ -859,7 +919,6 @@ class llama_context_params(ctypes.Structure):
abort_callback_data: ctypes.c_void_p
embeddings: bool
offload_kqv: bool
- flash_attn: bool
no_perf: bool
op_offload: bool
swa_full: bool
@@ -875,6 +934,7 @@ class llama_context_params(ctypes.Structure):
("rope_scaling_type", ctypes.c_int),
("pooling_type", ctypes.c_int),
("attention_type", ctypes.c_int),
+ ("flash_attn_type", ctypes.c_int),
("rope_freq_base", ctypes.c_float),
("rope_freq_scale", ctypes.c_float),
("yarn_ext_factor", ctypes.c_float),
@@ -891,7 +951,6 @@ class llama_context_params(ctypes.Structure):
("abort_callback_data", ctypes.c_void_p),
("embeddings", ctypes.c_bool),
("offload_kqv", ctypes.c_bool),
- ("flash_attn", ctypes.c_bool),
("no_perf", ctypes.c_bool),
("op_offload", ctypes.c_bool),
("swa_full", ctypes.c_bool),
@@ -1137,8 +1196,7 @@ def llama_backend_free():
[ctypes.c_int],
None,
)
-def llama_numa_init(numa: int, /):
- ...
+def llama_numa_init(numa: int, /): ...
# // Optional: an auto threadpool gets created in ggml if not passed explicitly
@@ -1146,11 +1204,26 @@ def llama_numa_init(numa: int, /):
# struct llama_context * ctx,
# ggml_threadpool_t threadpool,
# ggml_threadpool_t threadpool_batch);
-# TODO: Add llama_attach_threadpool
+@ctypes_function(
+ "llama_attach_threadpool",
+ [llama_context_p_ctypes, ggml_threadpool_t, ggml_threadpool_t],
+ None,
+)
+def llama_attach_threadpool(
+ ctx: llama_context_p,
+ threadpool: ctypes.c_void_p,
+ threadpool_batch: ctypes.c_void_p,
+ /,
+):
+ """Attach threadpools to context"""
+ ...
# LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
-# TODO: Add llama_detach_threadpool
+@ctypes_function("llama_detach_threadpool", [llama_context_p_ctypes], None)
+def llama_detach_threadpool(ctx: llama_context_p, /):
+ """Detach threadpool from context"""
+ ...
# DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
@@ -1164,8 +1237,7 @@ def llama_numa_init(numa: int, /):
)
def llama_load_model_from_file(
path_model: bytes, params: llama_model_params, /
-) -> Optional[llama_model_p]:
- ...
+) -> Optional[llama_model_p]: ...
# // Load the model from a file
@@ -1230,8 +1302,7 @@ def llama_model_save_to_file(model: llama_model_p, path_model: bytes, /):
[llama_model_p_ctypes],
None,
)
-def llama_free_model(model: llama_model_p, /):
- ...
+def llama_free_model(model: llama_model_p, /): ...
# LLAMA_API void llama_model_free(struct llama_model * model);
@@ -1240,8 +1311,7 @@ def llama_free_model(model: llama_model_p, /):
[llama_model_p_ctypes],
None,
)
-def llama_model_free(model: llama_model_p, /):
- ...
+def llama_model_free(model: llama_model_p, /): ...
# LLAMA_API struct llama_context * llama_init_from_model(
@@ -1254,8 +1324,7 @@ def llama_model_free(model: llama_model_p, /):
)
def llama_init_from_model(
model: llama_model_p, params: llama_context_params, /
-) -> Optional[llama_context_p]:
- ...
+) -> Optional[llama_context_p]: ...
# DEPRECATED(LLAMA_API struct llama_context * llama_new_context_with_model(
@@ -1269,8 +1338,7 @@ def llama_init_from_model(
)
def llama_new_context_with_model(
model: llama_model_p, params: llama_context_params, /
-) -> Optional[llama_context_p]:
- ...
+) -> Optional[llama_context_p]: ...
# // Frees all allocated memory
@@ -1291,104 +1359,150 @@ def llama_free(ctx: llama_context_p, /):
[],
ctypes.c_int64,
)
-def llama_time_us() -> int:
- ...
+def llama_time_us() -> int: ...
# LLAMA_API size_t llama_max_devices(void);
@ctypes_function("llama_max_devices", [], ctypes.c_size_t)
-def llama_max_devices() -> int:
- ...
+def llama_max_devices() -> int: ...
# LLAMA_API size_t llama_max_parallel_sequences(void);
@ctypes_function("llama_max_parallel_sequences", [], ctypes.c_size_t)
-def llama_max_parallel_sequences() -> int:
- ...
+def llama_max_parallel_sequences() -> int: ...
# LLAMA_API bool llama_supports_mmap (void);
@ctypes_function("llama_supports_mmap", [], ctypes.c_bool)
-def llama_supports_mmap() -> bool:
- ...
+def llama_supports_mmap() -> bool: ...
# LLAMA_API bool llama_supports_mlock (void);
@ctypes_function("llama_supports_mlock", [], ctypes.c_bool)
-def llama_supports_mlock() -> bool:
- ...
+def llama_supports_mlock() -> bool: ...
# LLAMA_API bool llama_supports_gpu_offload(void);
@ctypes_function("llama_supports_gpu_offload", [], ctypes.c_bool)
-def llama_supports_gpu_offload() -> bool:
- ...
+def llama_supports_gpu_offload() -> bool: ...
# LLAMA_API bool llama_supports_rpc (void);
@ctypes_function("llama_supports_rpc", [], ctypes.c_bool)
-def llama_supports_rpc() -> bool:
+def llama_supports_rpc() -> bool: ...
+
+
+# LLAMA_API size_t llama_max_tensor_buft_overrides(void);
+@ctypes_function("llama_max_tensor_buft_overrides", [], ctypes.c_size_t)
+def llama_max_tensor_buft_overrides() -> int:
+ """Get maximum number of tensor buffer type overrides"""
+ ...
+
+
+# LLAMA_API enum llama_params_fit_status llama_params_fit(
+# const char * path_model,
+# struct llama_model_params * mparams,
+# struct llama_context_params * cparams,
+# float * tensor_split,
+# struct llama_model_tensor_buft_override * tensor_buft_overrides,
+# size_t margin,
+# uint32_t n_ctx_min,
+# enum ggml_log_level log_level);
+@ctypes_function(
+ "llama_params_fit",
+ [
+ ctypes.c_char_p,
+ ctypes.POINTER(llama_model_params),
+ ctypes.POINTER(llama_context_params),
+ ctypes.POINTER(ctypes.c_float),
+ ctypes.c_void_p, # tensor_buft_overrides - not fully bound
+ ctypes.c_size_t, # margin
+ ctypes.c_uint32, # n_ctx_min
+ ctypes.c_int, # ggml_log_level (enum)
+ ],
+ ctypes.c_int,
+)
+def llama_params_fit(
+ path_model: bytes,
+ mparams: CtypesPointerOrRef[llama_model_params],
+ cparams: CtypesPointerOrRef[llama_context_params],
+ tensor_split: CtypesArray[ctypes.c_float],
+ tensor_buft_overrides: Optional[ctypes.c_void_p],
+ margin: Union[ctypes.c_size_t, int],
+ n_ctx_min: Union[ctypes.c_uint32, int],
+ log_level: int,
+ /,
+) -> int:
+ """Check if model parameters will fit in memory
+
+ Args:
+ margin: Memory margin to leave per device in bytes
+ n_ctx_min: Minimum context size when trying to reduce memory
+ log_level: Minimum log level (ggml_log_level enum)
+
+ Returns:
+ LLAMA_PARAMS_FIT_STATUS_SUCCESS (0) - found allocations that are projected to fit
+ LLAMA_PARAMS_FIT_STATUS_FAILURE (1) - could not find allocations that are projected to fit
+ LLAMA_PARAMS_FIT_STATUS_ERROR (2) - a hard error occurred
+ """
...
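+
+
+# Rough usage sketch for llama_params_fit (untested; assumes the default-params helpers
+# llama_model_default_params() / llama_context_default_params() defined elsewhere in this module):
+#   mparams = llama_model_default_params()
+#   cparams = llama_context_default_params()
+#   status = llama_params_fit(
+#       b"model.gguf", ctypes.byref(mparams), ctypes.byref(cparams),
+#       None, None, 0, 512, 0,  # no tensor_split/overrides, 0-byte margin, n_ctx_min=512, log_level 0
+#   )
+#   if status != LLAMA_PARAMS_FIT_STATUS_SUCCESS:
+#       print("model is not projected to fit in available memory")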
# LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
@ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32)
-def llama_n_ctx(ctx: llama_context_p, /) -> int:
+def llama_n_ctx(ctx: llama_context_p, /) -> int: ...
+
+
+# LLAMA_API uint32_t llama_n_ctx_seq(const struct llama_context * ctx);
+@ctypes_function("llama_n_ctx_seq", [llama_context_p_ctypes], ctypes.c_uint32)
+def llama_n_ctx_seq(ctx: llama_context_p, /) -> int:
+ """Get the context sequence size"""
...
# LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
@ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32)
-def llama_n_batch(ctx: llama_context_p, /) -> int:
- ...
+def llama_n_batch(ctx: llama_context_p, /) -> int: ...
# LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
@ctypes_function("llama_n_ubatch", [llama_context_p_ctypes], ctypes.c_uint32)
-def llama_n_ubatch(ctx: llama_context_p, /) -> int:
- ...
+def llama_n_ubatch(ctx: llama_context_p, /) -> int: ...
# LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
@ctypes_function("llama_n_seq_max", [llama_context_p_ctypes], ctypes.c_uint32)
-def llama_n_seq_max(ctx: llama_context_p, /) -> int:
- ...
+def llama_n_seq_max(ctx: llama_context_p, /) -> int: ...
# DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
@ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_n_ctx_train(model: llama_model_p, /) -> int:
- ...
+def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
# DEPRECATED(LLAMA_API int32_t llama_n_embd (const struct llama_model * model), "use llama_model_n_embd instead");
@ctypes_function("llama_n_embd", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_n_embd(model: llama_model_p, /) -> int:
- ...
+def llama_n_embd(model: llama_model_p, /) -> int: ...
# DEPRECATED(LLAMA_API int32_t llama_n_layer (const struct llama_model * model), "use llama_model_n_layer instead");
@ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_n_layer(model: llama_model_p, /) -> int:
- ...
+def llama_n_layer(model: llama_model_p, /) -> int: ...
# DEPRECATED(LLAMA_API int32_t llama_n_head (const struct llama_model * model), "use llama_model_n_head instead");
@ctypes_function("llama_n_head", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_n_head(model: llama_model_p, /) -> int:
- ...
+def llama_n_head(model: llama_model_p, /) -> int: ...
# DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
@ctypes_function("llama_n_vocab", [llama_vocab_p_ctypes], ctypes.c_int32)
-def llama_n_vocab(model: llama_vocab_p, /) -> int:
- ...
+def llama_n_vocab(model: llama_vocab_p, /) -> int: ...
# LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
@ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes)
-def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]:
- ...
+def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ...
# LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
@@ -1400,74 +1514,63 @@ def llama_get_memory(ctx: llama_context_p, /) -> Optional[llama_memory_t]:
# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
@ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int)
-def llama_pooling_type(ctx: llama_context_p, /) -> int:
- ...
+def llama_pooling_type(ctx: llama_context_p, /) -> int: ...
# DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
-@ctypes_function(
- "llama_get_kv_self",
- [llama_context_p_ctypes],
- llama_kv_cache_p_ctypes,
-)
-def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]:
- """Get the KV cache for self-attention (DEPRECATED)"""
- ...
-
-
# LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
@ctypes_function("llama_model_get_vocab", [llama_model_p_ctypes], llama_vocab_p_ctypes)
-def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]:
- ...
+def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]: ...
# LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
@ctypes_function("llama_model_rope_type", [llama_model_p_ctypes], ctypes.c_int)
-def llama_model_rope_type(model: llama_model_p, /) -> int:
- ...
+def llama_model_rope_type(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
@ctypes_function("llama_model_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_model_n_ctx_train(model: llama_model_p, /) -> int:
- ...
+def llama_model_n_ctx_train(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
@ctypes_function("llama_model_n_embd", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_model_n_embd(model: llama_model_p, /) -> int:
+def llama_model_n_embd(model: llama_model_p, /) -> int: ...
+
+
+# LLAMA_API int32_t llama_model_n_embd_inp(const struct llama_model * model);
+@ctypes_function("llama_model_n_embd_inp", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_model_n_embd_inp(model: llama_model_p, /) -> int:
+ """Get the input embedding dimension"""
...
# LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
@ctypes_function("llama_model_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_model_n_layer(model: llama_model_p, /) -> int:
- ...
+def llama_model_n_layer(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
@ctypes_function("llama_model_n_head", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_model_n_head(model: llama_model_p, /) -> int:
- ...
+def llama_model_n_head(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
@ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_model_n_head_kv(model: llama_model_p, /) -> int:
- ...
+def llama_model_n_head_kv(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);
@ctypes_function("llama_model_n_swa", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_model_n_swa(model: llama_model_p, /) -> int:
- ...
+def llama_model_n_swa(model: llama_model_p, /) -> int: ...
# // Get the model's RoPE frequency scaling factor
# LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
-@ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
-def llama_model_rope_freq_scale_train(model: llama_model_p, /) -> float:
- ...
+@ctypes_function(
+ "llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float
+)
+def llama_model_rope_freq_scale_train(model: llama_model_p, /) -> float: ...
# // Returns the number of classifier outputs (only valid for classifier models)
@@ -1481,7 +1584,9 @@ def llama_model_n_cls_out(model: llama_model_p, /) -> int:
# // Returns label of classifier output by index ( Optional[bytes]:
"""Returns label of classifier output by index. Returns None if no label provided"""
...
@@ -1489,14 +1594,12 @@ def llama_model_cls_label(model: llama_model_p, i: int, /) -> Optional[bytes]:
# LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
@ctypes_function("llama_vocab_type", [llama_vocab_p_ctypes], ctypes.c_int)
-def llama_vocab_type(vocab: llama_vocab_p, /) -> int:
- ...
+def llama_vocab_type(vocab: llama_vocab_p, /) -> int: ...
# LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
@ctypes_function("llama_vocab_n_tokens", [llama_vocab_p_ctypes], ctypes.c_int32)
-def llama_vocab_n_tokens(vocab: llama_vocab_p, /) -> int:
- ...
+def llama_vocab_n_tokens(vocab: llama_vocab_p, /) -> int: ...
# // Functions to access the model's GGUF metadata scalar values
@@ -1611,8 +1714,14 @@ def llama_model_size(model: llama_model_p, /) -> int:
# // Get the default chat template. Returns nullptr if not available
# // If name is NULL, returns the default chat template
# LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);
-@ctypes_function("llama_model_chat_template", [llama_model_p_ctypes, ctypes.c_char_p], ctypes.c_char_p)
-def llama_model_chat_template(model: llama_model_p, name: Optional[bytes], /) -> Optional[bytes]:
+@ctypes_function(
+ "llama_model_chat_template",
+ [llama_model_p_ctypes, ctypes.c_char_p],
+ ctypes.c_char_p,
+)
+def llama_model_chat_template(
+ model: llama_model_p, name: Optional[bytes], /
+) -> Optional[bytes]:
"""Get the default chat template. Returns None if not available
If name is None, returns the default chat template"""
...
@@ -1663,6 +1772,13 @@ def llama_model_is_recurrent(model: llama_model_p, /) -> bool:
...
+# LLAMA_API bool llama_model_is_hybrid(const struct llama_model * model);
+@ctypes_function("llama_model_is_hybrid", [llama_model_p_ctypes], ctypes.c_bool)
+def llama_model_is_hybrid(model: llama_model_p, /) -> bool:
+ """Returns true if model is hybrid (Jamba, Granite, etc.)"""
+ ...
+
+
# // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
# LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
@ctypes_function("llama_model_is_diffusion", [llama_model_p_ctypes], ctypes.c_bool)
@@ -1699,6 +1815,7 @@ def llama_model_quantize(
# // Adapters
# //
+
# // Load a LoRA adapter from file
# LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
# struct llama_model * model,
@@ -1710,8 +1827,7 @@ def llama_model_quantize(
)
def llama_adapter_lora_init(
model: llama_model_p, path_lora: bytes, /
-) -> Optional[llama_adapter_lora_p]:
- ...
+) -> Optional[llama_adapter_lora_p]: ...
# // Manually free a LoRA adapter
@@ -1722,7 +1838,80 @@ def llama_adapter_lora_init(
[llama_adapter_lora_p_ctypes],
None,
)
-def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /):
+def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /): ...
+
+
+# LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
+@ctypes_function(
+ "llama_adapter_meta_val_str",
+ [llama_adapter_lora_p_ctypes, ctypes.c_char_p, ctypes.c_char_p, ctypes.c_size_t],
+ ctypes.c_int32,
+)
+def llama_adapter_meta_val_str(
+ adapter: llama_adapter_lora_p, key: bytes, buf: bytes, buf_size: int, /
+) -> int:
+ """Get adapter metadata value as string"""
+ ...
+
+
+# LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
+@ctypes_function(
+ "llama_adapter_meta_count", [llama_adapter_lora_p_ctypes], ctypes.c_int32
+)
+def llama_adapter_meta_count(adapter: llama_adapter_lora_p, /) -> int:
+ """Get number of adapter metadata pairs"""
+ ...
+
+
+# LLAMA_API int32_t llama_adapter_meta_key_by_index(...);
+@ctypes_function(
+ "llama_adapter_meta_key_by_index",
+ [llama_adapter_lora_p_ctypes, ctypes.c_int32, ctypes.c_char_p, ctypes.c_size_t],
+ ctypes.c_int32,
+)
+def llama_adapter_meta_key_by_index(
+ adapter: llama_adapter_lora_p, i: int, buf: bytes, buf_size: int, /
+) -> int:
+ """Get adapter metadata key by index"""
+ ...
+
+
+# LLAMA_API int32_t llama_adapter_meta_val_str_by_index(...);
+@ctypes_function(
+ "llama_adapter_meta_val_str_by_index",
+ [llama_adapter_lora_p_ctypes, ctypes.c_int32, ctypes.c_char_p, ctypes.c_size_t],
+ ctypes.c_int32,
+)
+def llama_adapter_meta_val_str_by_index(
+ adapter: llama_adapter_lora_p, i: int, buf: bytes, buf_size: int, /
+) -> int:
+ """Get adapter metadata value by index"""
+ ...
+
+
+# LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(...);
+@ctypes_function(
+ "llama_adapter_get_alora_n_invocation_tokens",
+ [llama_adapter_lora_p_ctypes],
+ ctypes.c_uint64,
+)
+def llama_adapter_get_alora_n_invocation_tokens(
+ adapter: llama_adapter_lora_p, /
+) -> int:
+ """Get alora invocation token count"""
+ ...
+
+
+# LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens(...);
+@ctypes_function(
+ "llama_adapter_get_alora_invocation_tokens",
+ [llama_adapter_lora_p_ctypes],
+ ctypes.POINTER(llama_token),
+)
+def llama_adapter_get_alora_invocation_tokens(
+ adapter: llama_adapter_lora_p, /
+) -> ctypes.Array:
+ """Get alora invocation tokens"""
...
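# Usage sketch (not part of the bindings above; the buffer size and adapter path
# are illustrative assumptions):
#
#   adapter = llama_adapter_lora_init(model, b"/path/to/adapter.gguf")
#   buf = ctypes.create_string_buffer(256)
#   for i in range(llama_adapter_meta_count(adapter)):
#       llama_adapter_meta_key_by_index(adapter, i, buf, ctypes.sizeof(buf))
#       key = bytes(buf.value)
#       llama_adapter_meta_val_str_by_index(adapter, i, buf, ctypes.sizeof(buf))
#       print(key.decode("utf-8"), "=", buf.value.decode("utf-8"))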
@@ -1825,6 +2014,7 @@ def llama_apply_adapter_cvec(
# // Memory
# //
+
# // Clear the memory contents
# // If data == true, the data buffers will also be cleared together with the metadata
# LLAMA_API void llama_memory_clear(
@@ -1916,9 +2106,7 @@ def llama_memory_seq_cp(
# LLAMA_API void llama_memory_seq_keep(
# llama_memory_t mem,
# llama_seq_id seq_id);
-@ctypes_function(
- "llama_memory_seq_keep", [llama_memory_t_ctypes, llama_seq_id], None
-)
+@ctypes_function("llama_memory_seq_keep", [llama_memory_t_ctypes, llama_seq_id], None)
def llama_memory_seq_keep(mem: llama_memory_t, seq_id: Union[llama_seq_id, int], /):
"""Removes all tokens that do not belong to the specified sequence"""
...
@@ -2038,260 +2226,11 @@ def llama_memory_can_shift(mem: llama_memory_t, /) -> bool:
# //
# // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
-# //
-
-# // Returns the number of tokens in the KV cache (slow, use only for debug)
-# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-# DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
-# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-@ctypes_function(
- "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32
-)
-def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int:
- """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)"""
- ...
-
-
-# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-# DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
-# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-@ctypes_function(
- "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32
-)
-def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int:
- """Returns the number of used KV cells (DEPRECATED)"""
- ...
-
-
-# // Clear the KV cache - both cell info is erased and KV data is zeroed
-# DEPRECATED(LLAMA_API void llama_kv_self_clear(
-# struct llama_context * ctx),
-# "Use llama_memory_clear() instead");
-@ctypes_function(
- "llama_kv_self_clear", [llama_context_p_ctypes], None
-)
-def llama_kv_self_clear(ctx: llama_context_p, /):
- """Clear the KV cache (DEPRECATED)"""
- ...
-
-
-# // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-# // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
-# // seq_id < 0 : match any sequence
-# // p0 < 0 : [0, p1]
-# // p1 < 0 : [p0, inf)
-# DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
-# struct llama_context * ctx,
-# llama_seq_id seq_id,
-# llama_pos p0,
-# llama_pos p1),
-# "Use llama_memory_seq_rm() instead");
-@ctypes_function(
- "llama_kv_self_seq_rm",
- [
- llama_context_p_ctypes,
- llama_seq_id,
- llama_pos,
- llama_pos,
- ],
- ctypes.c_bool,
-)
-def llama_kv_self_seq_rm(
- ctx: llama_context_p,
- seq_id: Union[llama_seq_id, int],
- p0: Union[llama_pos, int],
- p1: Union[llama_pos, int],
- /,
-) -> bool:
- """Remove tokens from KV cache (DEPRECATED)"""
- ...
-
-
-# // Copy all tokens that belong to the specified sequence to another sequence
-# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
-# // p0 < 0 : [0, p1]
-# // p1 < 0 : [p0, inf)
-# DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
-# struct llama_context * ctx,
-# llama_seq_id seq_id_src,
-# llama_seq_id seq_id_dst,
-# llama_pos p0,
-# llama_pos p1),
-# "Use llama_memory_seq_cp() instead");
-@ctypes_function(
- "llama_kv_self_seq_cp",
- [
- llama_context_p_ctypes,
- llama_seq_id,
- llama_seq_id,
- llama_pos,
- llama_pos,
- ],
- None,
-)
-def llama_kv_self_seq_cp(
- ctx: llama_context_p,
- seq_id_src: Union[llama_seq_id, int],
- seq_id_dst: Union[llama_seq_id, int],
- p0: Union[llama_pos, int],
- p1: Union[llama_pos, int],
- /,
-):
- """Copy tokens in KV cache (DEPRECATED)"""
- ...
-
-
-# // Removes all tokens that do not belong to the specified sequence
-# DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
-# struct llama_context * ctx,
-# llama_seq_id seq_id),
-# "Use llama_memory_seq_keep() instead");
-@ctypes_function(
- "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
-)
-def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
- """Keep only specified sequence in KV cache (DEPRECATED)"""
- ...
-
-
-# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-# // If the KV cache is RoPEd, the KV data is updated accordingly:
-# // - lazily on next llama_decode()
-# // p0 < 0 : [0, p1]
-# // p1 < 0 : [p0, inf)
-# DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
-# struct llama_context * ctx,
-# llama_seq_id seq_id,
-# llama_pos p0,
-# llama_pos p1,
-# llama_pos delta),
-# "Use llama_memory_seq_add() instead");
-@ctypes_function(
- "llama_kv_self_seq_add",
- [
- llama_context_p_ctypes,
- llama_seq_id,
- llama_pos,
- llama_pos,
- llama_pos,
- ],
- None,
-)
-def llama_kv_self_seq_add(
- ctx: llama_context_p,
- seq_id: Union[llama_seq_id, int],
- p0: Union[llama_pos, int],
- p1: Union[llama_pos, int],
- delta: Union[llama_pos, int],
- /,
-):
- """Add delta to sequence positions in KV cache (DEPRECATED)"""
- ...
-
-
-# // Integer division of the positions by factor of `d > 1`
-# // If the KV cache is RoPEd, the KV data is updated accordingly:
-# // - lazily on next llama_decode()
-# // p0 < 0 : [0, p1]
-# // p1 < 0 : [p0, inf)
-# DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
-# struct llama_context * ctx,
-# llama_seq_id seq_id,
-# llama_pos p0,
-# llama_pos p1,
-# int d),
-# "Use llama_memory_seq_div() instead");
-@ctypes_function(
- "llama_kv_self_seq_div",
- [
- llama_context_p_ctypes,
- llama_seq_id,
- llama_pos,
- llama_pos,
- ctypes.c_int,
- ],
- None,
-)
-def llama_kv_self_seq_div(
- ctx: llama_context_p,
- seq_id: Union[llama_seq_id, int],
- p0: Union[llama_pos, int],
- p1: Union[llama_pos, int],
- d: Union[ctypes.c_int, int],
- /,
-):
- """Divide sequence positions in KV cache (DEPRECATED)"""
- ...
-
-
-# // Returns the smallest position present in the KV cache for the specified sequence
-# // This is typically non-zero only for SWA caches
-# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-# // Return -1 if the sequence is empty
-# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
-# struct llama_context * ctx,
-# llama_seq_id seq_id),
-# "Use llama_memory_seq_pos_min() instead");
-@ctypes_function(
- "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos
-)
-def llama_kv_self_seq_pos_min(
- ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /
-) -> int:
- """Returns the smallest position in KV cache for sequence (DEPRECATED)"""
- ...
-
-
-# // Returns the largest position present in the KV cache for the specified sequence
-# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-# // Return -1 if the sequence is empty
-# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
-# struct llama_context * ctx,
-# llama_seq_id seq_id),
-# "Use llama_memory_seq_pos_max() instead");
-@ctypes_function(
- "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos
-)
-def llama_kv_self_seq_pos_max(
- ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /
-) -> int:
- """Returns the largest position in KV cache for sequence (DEPRECATED)"""
- ...
-
-
-# // Defragment the KV cache
-# // This will be applied:
-# // - lazily on next llama_decode()
-# DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
-# "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
-@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None)
-def llama_kv_self_defrag(ctx: llama_context_p, /):
- """Defragment the KV cache (DEPRECATED)"""
- ...
-
-
-# // Check if the context supports KV cache shifting
-# DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
-# "use llama_memory_can_shift() instead");
-@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
-def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool:
- """Check if the context supports KV cache shifting (DEPRECATED)"""
- ...
-
-
-# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-# DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
-# "simply remove this call, updates are applied lazily on the next llama_decode()");
-@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None)
-def llama_kv_self_update(ctx: llama_context_p, /):
- """Apply the KV cache updates (DEPRECATED)"""
- ...
-
-
# //
# // State / sessions
# //
+
# // Returns the *actual* size in bytes of the state
# // (logits, embedding and memory)
# // Only use when saving the state, not when restoring it, otherwise the size may be too small.
@@ -2420,8 +2359,7 @@ def llama_state_load_file(
n_token_capacity: Union[ctypes.c_size_t, int],
n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
/,
-) -> bool:
- ...
+) -> bool: ...
# LLAMA_API DEPRECATED(bool llama_load_session_file(
@@ -2449,8 +2387,7 @@ def llama_load_session_file(
n_token_capacity: Union[ctypes.c_size_t, int],
n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
/,
-) -> bool:
- ...
+) -> bool: ...
# LLAMA_API bool llama_state_save_file(
@@ -2474,8 +2411,7 @@ def llama_state_save_file(
tokens: CtypesArray[llama_token],
n_token_count: Union[ctypes.c_size_t, int],
/,
-) -> bool:
- ...
+) -> bool: ...
# LLAMA_API DEPRECATED(bool llama_save_session_file(
@@ -2500,8 +2436,7 @@ def llama_save_session_file(
tokens: CtypesArray[llama_token],
n_token_count: Union[ctypes.c_size_t, int],
/,
-) -> bool:
- ...
+) -> bool: ...
# // Get the exact size needed to copy the state of a single sequence
@@ -2599,8 +2534,7 @@ def llama_state_seq_save_file(
tokens: CtypesArray[llama_token],
n_token_count: Union[ctypes.c_size_t, int],
/,
-) -> int:
- ...
+) -> int: ...
# LLAMA_API size_t llama_state_seq_load_file(
@@ -2630,7 +2564,83 @@ def llama_state_seq_load_file(
n_token_capacity: Union[ctypes.c_size_t, int],
n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
/,
+) -> int: ...
+
+
+# LLAMA_API size_t llama_state_seq_get_size_ext(
+# struct llama_context * ctx,
+# llama_seq_id seq_id,
+# llama_state_seq_flags flags);
+@ctypes_function(
+ "llama_state_seq_get_size_ext",
+ [llama_context_p_ctypes, llama_seq_id, llama_state_seq_flags],
+ ctypes.c_size_t,
+)
+def llama_state_seq_get_size_ext(
+ ctx: llama_context_p,
+ seq_id: Union[llama_seq_id, int],
+ flags: Union[llama_state_seq_flags, int],
+ /,
+) -> int:
+ """Get size needed to copy sequence state with flags"""
+ ...
+
+
+# LLAMA_API size_t llama_state_seq_get_data_ext(
+# struct llama_context * ctx,
+# uint8_t * dst,
+# size_t size,
+# llama_seq_id seq_id,
+# llama_state_seq_flags flags);
+@ctypes_function(
+ "llama_state_seq_get_data_ext",
+ [
+ llama_context_p_ctypes,
+ ctypes.POINTER(ctypes.c_uint8),
+ ctypes.c_size_t,
+ llama_seq_id,
+ llama_state_seq_flags,
+ ],
+ ctypes.c_size_t,
+)
+def llama_state_seq_get_data_ext(
+ ctx: llama_context_p,
+ dst: CtypesArray[ctypes.c_uint8],
+ size: Union[ctypes.c_size_t, int],
+ seq_id: Union[llama_seq_id, int],
+ flags: Union[llama_state_seq_flags, int],
+ /,
+) -> int:
+ """Copy sequence state to buffer with flags"""
+ ...
+
+
+# LLAMA_API size_t llama_state_seq_set_data_ext(
+# struct llama_context * ctx,
+# const uint8_t * src,
+# size_t size,
+# llama_seq_id dest_seq_id,
+# llama_state_seq_flags flags);
+@ctypes_function(
+ "llama_state_seq_set_data_ext",
+ [
+ llama_context_p_ctypes,
+ ctypes.POINTER(ctypes.c_uint8),
+ ctypes.c_size_t,
+ llama_seq_id,
+ llama_state_seq_flags,
+ ],
+ ctypes.c_size_t,
+)
+def llama_state_seq_set_data_ext(
+ ctx: llama_context_p,
+ src: CtypesArray[ctypes.c_uint8],
+ size: Union[ctypes.c_size_t, int],
+ dest_seq_id: Union[llama_seq_id, int],
+ flags: Union[llama_state_seq_flags, int],
+ /,
) -> int:
+ """Restore sequence state from buffer with flags"""
...
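# Usage sketch (assumptions: sequence id 0 and flags=0 for default behaviour):
#
#   n_bytes = llama_state_seq_get_size_ext(ctx, 0, 0)
#   buf = (ctypes.c_uint8 * n_bytes)()
#   written = llama_state_seq_get_data_ext(ctx, buf, n_bytes, 0, 0)
#   # ... later, restore the captured sequence state into sequence 0 ...
#   read = llama_state_seq_set_data_ext(ctx, buf, written, 0, 0)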
@@ -2638,6 +2648,7 @@ def llama_state_seq_load_file(
# // Decoding
# //
+
# // Return batch for single sequence of tokens
# // The sequence ID will be fixed to 0
# // The position of the tokens will be tracked automatically by llama_decode
@@ -2947,14 +2958,14 @@ def llama_get_embeddings_seq(
# // Vocab
# //
+
# LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token);
@ctypes_function(
"llama_vocab_get_text", [llama_vocab_p_ctypes, llama_token], ctypes.c_char_p
)
def llama_vocab_get_text(
vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> bytes:
- ...
+) -> bytes: ...
# LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token);
@@ -2963,8 +2974,7 @@ def llama_vocab_get_text(
)
def llama_vocab_get_score(
vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> float:
- ...
+) -> float: ...
# LLAMA_API enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token);
@@ -2973,8 +2983,7 @@ def llama_vocab_get_score(
)
def llama_vocab_get_attr(
vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> int:
- ...
+) -> int: ...
# // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
@@ -3055,8 +3064,7 @@ def llama_vocab_mask(vocab: llama_vocab_p, /) -> llama_token:
[llama_vocab_p_ctypes],
ctypes.c_bool,
)
-def llama_vocab_get_add_bos(vocab: llama_vocab_p, /) -> bool:
- ...
+def llama_vocab_get_add_bos(vocab: llama_vocab_p, /) -> bool: ...
# LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
@@ -3065,8 +3073,7 @@ def llama_vocab_get_add_bos(vocab: llama_vocab_p, /) -> bool:
[llama_vocab_p_ctypes],
ctypes.c_bool,
)
-def llama_vocab_get_add_eos(vocab: llama_vocab_p, /) -> bool:
- ...
+def llama_vocab_get_add_eos(vocab: llama_vocab_p, /) -> bool: ...
# LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
@@ -3075,8 +3082,7 @@ def llama_vocab_get_add_eos(vocab: llama_vocab_p, /) -> bool:
[llama_vocab_p_ctypes],
ctypes.c_bool,
)
-def llama_vocab_get_add_sep(vocab: llama_vocab_p, /) -> bool:
- ...
+def llama_vocab_get_add_sep(vocab: llama_vocab_p, /) -> bool: ...
# LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
@@ -3085,8 +3091,7 @@ def llama_vocab_get_add_sep(vocab: llama_vocab_p, /) -> bool:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_vocab_fim_pre(vocab: llama_vocab_p, /) -> llama_token:
- ...
+def llama_vocab_fim_pre(vocab: llama_vocab_p, /) -> llama_token: ...
# LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -3095,8 +3100,7 @@ def llama_vocab_fim_pre(vocab: llama_vocab_p, /) -> llama_token:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_vocab_fim_suf(vocab: llama_vocab_p, /) -> llama_token:
- ...
+def llama_vocab_fim_suf(vocab: llama_vocab_p, /) -> llama_token: ...
# LLAMA_API llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab);
@@ -3105,8 +3109,7 @@ def llama_vocab_fim_suf(vocab: llama_vocab_p, /) -> llama_token:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_vocab_fim_mid(vocab: llama_vocab_p, /) -> llama_token:
- ...
+def llama_vocab_fim_mid(vocab: llama_vocab_p, /) -> llama_token: ...
# LLAMA_API llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab);
@@ -3115,8 +3118,7 @@ def llama_vocab_fim_mid(vocab: llama_vocab_p, /) -> llama_token:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_vocab_fim_pad(vocab: llama_vocab_p, /) -> llama_token:
- ...
+def llama_vocab_fim_pad(vocab: llama_vocab_p, /) -> llama_token: ...
# LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab);
@@ -3125,8 +3127,7 @@ def llama_vocab_fim_pad(vocab: llama_vocab_p, /) -> llama_token:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_vocab_fim_rep(vocab: llama_vocab_p, /) -> llama_token:
- ...
+def llama_vocab_fim_rep(vocab: llama_vocab_p, /) -> llama_token: ...
# LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab);
@@ -3135,8 +3136,7 @@ def llama_vocab_fim_rep(vocab: llama_vocab_p, /) -> llama_token:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_vocab_fim_sep(vocab: llama_vocab_p, /) -> llama_token:
- ...
+def llama_vocab_fim_sep(vocab: llama_vocab_p, /) -> llama_token: ...
# DEPRECATED functions
@@ -3148,8 +3148,7 @@ def llama_vocab_fim_sep(vocab: llama_vocab_p, /) -> llama_token:
)
def llama_token_get_text(
vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> bytes:
- ...
+) -> bytes: ...
# DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead");
@@ -3160,8 +3159,8 @@ def llama_token_get_text(
)
def llama_token_get_score(
vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> float:
- ...
+) -> float: ...
+
# DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead");
@ctypes_function(
@@ -3171,8 +3170,8 @@ def llama_token_get_score(
)
def llama_token_get_attr(
vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> int:
- ...
+) -> int: ...
+
# DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead");
@ctypes_function(
@@ -3182,8 +3181,8 @@ def llama_token_get_attr(
)
def llama_token_is_eog(
vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> bool:
- ...
+) -> bool: ...
+
# DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead");
@ctypes_function(
@@ -3193,8 +3192,8 @@ def llama_token_is_eog(
)
def llama_token_is_control(
vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> bool:
- ...
+) -> bool: ...
+
# DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead");
@ctypes_function(
@@ -3202,8 +3201,8 @@ def llama_token_is_control(
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_token_bos(vocab: llama_vocab_p, /) -> int:
- ...
+def llama_token_bos(vocab: llama_vocab_p, /) -> int: ...
+
# DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead");
@ctypes_function(
@@ -3211,8 +3210,8 @@ def llama_token_bos(vocab: llama_vocab_p, /) -> int:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_token_eos(vocab: llama_vocab_p, /) -> int:
- ...
+def llama_token_eos(vocab: llama_vocab_p, /) -> int: ...
+
# DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead");
@ctypes_function(
@@ -3220,8 +3219,8 @@ def llama_token_eos(vocab: llama_vocab_p, /) -> int:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_token_eot(vocab: llama_vocab_p, /) -> int:
- ...
+def llama_token_eot(vocab: llama_vocab_p, /) -> int: ...
+
# DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead");
@ctypes_function(
@@ -3229,8 +3228,8 @@ def llama_token_eot(vocab: llama_vocab_p, /) -> int:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_token_cls(vocab: llama_vocab_p, /) -> int:
- ...
+def llama_token_cls(vocab: llama_vocab_p, /) -> int: ...
+
# DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead");
@ctypes_function(
@@ -3238,8 +3237,7 @@ def llama_token_cls(vocab: llama_vocab_p, /) -> int:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_token_sep(vocab: llama_vocab_p, /) -> int:
- ...
+def llama_token_sep(vocab: llama_vocab_p, /) -> int: ...
# DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead");
@@ -3248,8 +3246,7 @@ def llama_token_sep(vocab: llama_vocab_p, /) -> int:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_token_nl(vocab: llama_vocab_p, /) -> int:
- ...
+def llama_token_nl(vocab: llama_vocab_p, /) -> int: ...
# DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead");
@@ -3258,8 +3255,7 @@ def llama_token_nl(vocab: llama_vocab_p, /) -> int:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_token_pad(vocab: llama_vocab_p, /) -> int:
- ...
+def llama_token_pad(vocab: llama_vocab_p, /) -> int: ...
# DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead");
@@ -3268,8 +3264,8 @@ def llama_token_pad(vocab: llama_vocab_p, /) -> int:
[llama_vocab_p_ctypes],
ctypes.c_bool,
)
-def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool:
- ...
+def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool: ...
+
# DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead");
@ctypes_function(
@@ -3277,8 +3273,7 @@ def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool:
[llama_vocab_p_ctypes],
ctypes.c_bool,
)
-def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool:
- ...
+def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool: ...
# DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead");
@@ -3287,8 +3282,8 @@ def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token:
- ...
+def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token: ...
+
# DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead");
@ctypes_function(
@@ -3296,8 +3291,8 @@ def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token:
- ...
+def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token: ...
+
# DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead");
@ctypes_function(
@@ -3305,8 +3300,8 @@ def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token:
- ...
+def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token: ...
+
# DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead");
@ctypes_function(
@@ -3314,8 +3309,8 @@ def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token:
- ...
+def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token: ...
+
# DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead");
@ctypes_function(
@@ -3323,8 +3318,8 @@ def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token:
- ...
+def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token: ...
+
# DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead");
@ctypes_function(
@@ -3332,8 +3327,8 @@ def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token:
- ...
+def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token: ...
+
# // CLS is equivalent to BOS
# DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification
@@ -3343,8 +3338,7 @@ def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token:
[llama_vocab_p_ctypes],
llama_token,
)
-def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token:
- ...
+def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token: ...
# //
@@ -3353,6 +3347,7 @@ def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token:
# // The API is thread-safe.
# //
+
# /// @details Convert the provided text into tokens.
# /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
# /// @return Returns the number of tokens on success, no more than n_tokens_max
@@ -3512,6 +3507,7 @@ def llama_detokenize(
# // Chat templates
# //
+
# /// Apply chat template. Inspired by hf apply_chat_template() on python.
# /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
# /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
@@ -3535,9 +3531,9 @@ def llama_detokenize(
ctypes.c_char_p, # tmpl
ctypes.POINTER(llama_chat_message), # chat
ctypes.c_size_t, # n_msg
- ctypes.c_bool, # add_ass (added)
+ ctypes.c_bool, # add_ass (added)
ctypes.c_char_p, # buf
- ctypes.c_int32, # length
+ ctypes.c_int32, # length
],
ctypes.c_int32,
)
@@ -3611,11 +3607,11 @@ def llama_chat_builtin_templates(
# struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
# void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL
+
# // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
# //void (*apply_ggml) (struct llama_sampler * smpl, ...);
# };
-class llama_sampler_i(ctypes.Structure):
- ...
+class llama_sampler_i(ctypes.Structure): ...
# struct llama_sampler {
@@ -3662,8 +3658,7 @@ class llama_sampler(ctypes.Structure):
)
def llama_sampler_init(
iface: ctypes.POINTER(llama_sampler_i), ctx: llama_sampler_context_t, /
-) -> llama_sampler_p:
- ...
+) -> llama_sampler_p: ...
# LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
@@ -3672,8 +3667,7 @@ def llama_sampler_init(
[llama_sampler_p_ctypes],
ctypes.c_char_p,
)
-def llama_sampler_name(smpl: llama_sampler_p, /) -> bytes:
- ...
+def llama_sampler_name(smpl: llama_sampler_p, /) -> bytes: ...
# LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
@@ -3682,8 +3676,7 @@ def llama_sampler_name(smpl: llama_sampler_p, /) -> bytes:
[llama_sampler_p_ctypes, llama_token],
None,
)
-def llama_sampler_accept(smpl: llama_sampler_p, token: Union[llama_token, int], /):
- ...
+def llama_sampler_accept(smpl: llama_sampler_p, token: Union[llama_token, int], /): ...
# LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
@@ -3694,8 +3687,7 @@ def llama_sampler_accept(smpl: llama_sampler_p, token: Union[llama_token, int],
)
def llama_sampler_apply(
smpl: llama_sampler_p, cur_p: CtypesArray[llama_token_data_array], /
-):
- ...
+): ...
# LLAMA_API void llama_sampler_reset ( struct llama_sampler * smpl);
@@ -3704,8 +3696,7 @@ def llama_sampler_apply(
[llama_sampler_p_ctypes],
None,
)
-def llama_sampler_reset(smpl: llama_sampler_p, /):
- ...
+def llama_sampler_reset(smpl: llama_sampler_p, /): ...
# LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl);
@@ -3714,8 +3705,7 @@ def llama_sampler_reset(smpl: llama_sampler_p, /):
[llama_sampler_p_ctypes],
llama_sampler_p_ctypes,
)
-def llama_sampler_clone(smpl: llama_sampler_p, /) -> llama_sampler_p:
- ...
+def llama_sampler_clone(smpl: llama_sampler_p, /) -> llama_sampler_p: ...
# // important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add)
@@ -3725,21 +3715,22 @@ def llama_sampler_clone(smpl: llama_sampler_p, /) -> llama_sampler_p:
[llama_sampler_p_ctypes],
None,
)
-def llama_sampler_free(smpl: llama_sampler_p, /):
- ...
+def llama_sampler_free(smpl: llama_sampler_p, /): ...
# // llama_sampler_chain
# // a type of llama_sampler that can chain multiple samplers one after another
+
# LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params);
@ctypes_function(
"llama_sampler_chain_init",
[llama_sampler_chain_params],
llama_sampler_p_ctypes,
)
-def llama_sampler_chain_init(params: llama_sampler_chain_params, /) -> llama_sampler_p:
- ...
+def llama_sampler_chain_init(
+ params: llama_sampler_chain_params, /
+) -> llama_sampler_p: ...
# // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
@@ -3749,8 +3740,7 @@ def llama_sampler_chain_init(params: llama_sampler_chain_params, /) -> llama_sam
[llama_sampler_p_ctypes, llama_sampler_p_ctypes],
None,
)
-def llama_sampler_chain_add(chain: llama_sampler_p, smpl: llama_sampler_p, /):
- ...
+def llama_sampler_chain_add(chain: llama_sampler_p, smpl: llama_sampler_p, /): ...
# LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
@@ -3761,8 +3751,7 @@ def llama_sampler_chain_add(chain: llama_sampler_p, smpl: llama_sampler_p, /):
)
def llama_sampler_chain_get(
chain: llama_sampler_p, i: Union[ctypes.c_int32, int], /
-) -> llama_sampler_p:
- ...
+) -> llama_sampler_p: ...
# LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
@@ -3771,8 +3760,7 @@ def llama_sampler_chain_get(
[llama_sampler_p_ctypes],
ctypes.c_int,
)
-def llama_sampler_chain_n(chain: llama_sampler_p, /) -> int:
- ...
+def llama_sampler_chain_n(chain: llama_sampler_p, /) -> int: ...
# // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
@@ -3784,39 +3772,33 @@ def llama_sampler_chain_n(chain: llama_sampler_p, /) -> int:
)
def llama_sampler_chain_remove(
chain: llama_sampler_p, i: Union[ctypes.c_int32, int], /
-) -> llama_sampler_p:
- ...
+) -> llama_sampler_p: ...
# // available samplers:
+
# LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
@ctypes_function("llama_sampler_init_greedy", [], llama_sampler_p_ctypes)
-def llama_sampler_init_greedy() -> llama_sampler_p:
- ...
+def llama_sampler_init_greedy() -> llama_sampler_p: ...
# LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
@ctypes_function("llama_sampler_init_dist", [ctypes.c_uint32], llama_sampler_p_ctypes)
-def llama_sampler_init_dist(seed: int) -> llama_sampler_p:
- ...
+def llama_sampler_init_dist(seed: int) -> llama_sampler_p: ...
# /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
# /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
# "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
-@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes)
-def llama_sampler_init_softmax() -> llama_sampler_p:
- ...
# /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
# /// Setting k <= 0 makes this a noop
# LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@ctypes_function("llama_sampler_init_top_k", [ctypes.c_int32], llama_sampler_p_ctypes)
-def llama_sampler_init_top_k(k: int) -> llama_sampler_p:
- ...
+def llama_sampler_init_top_k(k: int) -> llama_sampler_p: ...
# /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
@@ -3826,8 +3808,7 @@ def llama_sampler_init_top_k(k: int) -> llama_sampler_p:
[ctypes.c_float, ctypes.c_size_t],
llama_sampler_p_ctypes,
)
-def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p:
- ...
+def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p: ...
# /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
@@ -3837,8 +3818,7 @@ def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p:
[ctypes.c_float, ctypes.c_size_t],
llama_sampler_p_ctypes,
)
-def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p:
- ...
+def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p: ...
# /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
@@ -3848,15 +3828,13 @@ def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p:
[ctypes.c_float, ctypes.c_size_t],
llama_sampler_p_ctypes,
)
-def llama_sampler_init_typical(p: float, min_keep: int) -> llama_sampler_p:
- ...
+def llama_sampler_init_typical(p: float, min_keep: int) -> llama_sampler_p: ...
# /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf
# LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);
@ctypes_function("llama_sampler_init_temp", [ctypes.c_float], llama_sampler_p_ctypes)
-def llama_sampler_init_temp(t: float) -> llama_sampler_p:
- ...
+def llama_sampler_init_temp(t: float) -> llama_sampler_p: ...
# /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
@@ -3868,8 +3846,7 @@ def llama_sampler_init_temp(t: float) -> llama_sampler_p:
)
def llama_sampler_init_temp_ext(
t: float, delta: float, exponent: float
-) -> llama_sampler_p:
- ...
+) -> llama_sampler_p: ...
# /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
@@ -3881,8 +3858,7 @@ def llama_sampler_init_temp_ext(
)
def llama_sampler_init_xtc(
p: float, t: float, min_keep: int, seed: int, /
-) -> llama_sampler_p:
- ...
+) -> llama_sampler_p: ...
# /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
@@ -3892,8 +3868,7 @@ def llama_sampler_init_xtc(
[ctypes.c_float],
llama_sampler_p_ctypes,
)
-def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p:
- ...
+def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p: ...
# /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
@@ -3910,8 +3885,7 @@ def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p:
)
def llama_sampler_init_mirostat(
n_vocab: int, seed: int, tau: float, eta: float, m: int, /
-) -> llama_sampler_p:
- ...
+) -> llama_sampler_p: ...
# /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
@@ -3926,8 +3900,7 @@ def llama_sampler_init_mirostat(
)
def llama_sampler_init_mirostat_v2(
seed: int, tau: float, eta: float, /
-) -> llama_sampler_p:
- ...
+) -> llama_sampler_p: ...
# /// @details Intializes a GBNF grammar, see grammars/README.md for details.
@@ -3942,8 +3915,7 @@ def llama_sampler_init_mirostat_v2(
)
def llama_sampler_init_grammar(
vocab: llama_vocab_p, grammar_str: bytes, grammar_root: bytes, /
-) -> llama_sampler_p:
- ...
+) -> llama_sampler_p: ...
# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
@@ -3977,8 +3949,7 @@ def llama_sampler_init_grammar_lazy(
trigger_tokens: CtypesArray[llama_token],
num_trigger_tokens: int,
/,
-) -> llama_sampler_p:
- ...
+) -> llama_sampler_p: ...
# /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
@@ -4012,8 +3983,7 @@ def llama_sampler_init_grammar_lazy_patterns(
trigger_tokens: CtypesArray[llama_token],
num_trigger_tokens: int,
/,
-) -> llama_sampler_p:
- ...
+) -> llama_sampler_p: ...
# /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
@@ -4033,8 +4003,7 @@ def llama_sampler_init_penalties(
penalty_freq: float,
penalty_present: float,
/,
-) -> llama_sampler_p:
- ...
+) -> llama_sampler_p: ...
# /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
@@ -4071,8 +4040,7 @@ def llama_sampler_init_dry(
seq_breakers,
num_breakers: int,
/,
-) -> llama_sampler_p:
- ...
+) -> llama_sampler_p: ...
# LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
@@ -4086,8 +4054,7 @@ def llama_sampler_init_dry(
)
def llama_sampler_init_logit_bias(
n_vocab: int, n_logit_bias: int, logit_bias: CtypesArray[llama_logit_bias], /
-) -> llama_sampler_p:
- ...
+) -> llama_sampler_p: ...
# // this sampler is meant to be used for fill-in-the-middle infilling
@@ -4097,8 +4064,7 @@ def llama_sampler_init_logit_bias(
[llama_vocab_p_ctypes],
llama_sampler_p_ctypes,
)
-def llama_sampler_init_infill(vocab: llama_vocab_p, /) -> llama_sampler_p:
- ...
+def llama_sampler_init_infill(vocab: llama_vocab_p, /) -> llama_sampler_p: ...
# // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
@@ -4108,8 +4074,7 @@ def llama_sampler_init_infill(vocab: llama_vocab_p, /) -> llama_sampler_p:
[llama_sampler_p_ctypes],
ctypes.c_uint32,
)
-def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int:
- ...
+def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int: ...
# /// @details Sample and accept a token from the idx-th output of the last evaluation
@@ -4121,14 +4086,14 @@ def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int:
)
def llama_sampler_sample(
smpl: llama_sampler_p, ctx: llama_context_p, idx: int, /
-) -> int:
- ...
+) -> int: ...
# //
# // Model split
# //
+
# /// @details Build a split GGUF final path for this chunk.
# LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
@ctypes_function(
@@ -4170,16 +4135,34 @@ def llama_split_prefix(
# // Print system information
# LLAMA_API const char * llama_print_system_info(void);
@ctypes_function("llama_print_system_info", [], ctypes.c_char_p)
-def llama_print_system_info() -> bytes:
+def llama_print_system_info() -> bytes: ...
+
+
+# LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type type);
+@ctypes_function("llama_flash_attn_type_name", [ctypes.c_int], ctypes.c_char_p)
+def llama_flash_attn_type_name(type: int, /) -> bytes:
+ """Get name of flash attention type"""
+ ...
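+# Usage sketch (LLAMA_FLASH_ATTN_TYPE_ENABLED is the constant referenced in this
+# repo's tests; the returned value is a bytes name):
+#   name = llama_flash_attn_type_name(LLAMA_FLASH_ATTN_TYPE_ENABLED).decode("utf-8")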
+
+
+# LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key);
+@ctypes_function("llama_model_meta_key_str", [ctypes.c_int], ctypes.c_char_p)
+def llama_model_meta_key_str(key: int, /) -> bytes:
+ """Get string representation of model meta key"""
+ ...
+
+
+# LLAMA_API ggml_log_callback llama_log_get(void);
+@ctypes_function("llama_log_get", [], ggml_log_callback)
+def llama_log_get() -> ggml_log_callback:
+ """Get current log callback"""
...
-# // Set callback for all future logging events.
-# // If this is not called, or NULL is supplied, everything is output on stderr.
# LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
@ctypes_function(
"llama_log_set",
- [ctypes.c_void_p, ctypes.c_void_p],
+ [ggml_log_callback, ctypes.c_void_p],
None,
)
def llama_log_set(
@@ -4193,7 +4176,17 @@ def llama_log_set(
...
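# Usage sketch (assumption: ggml_log_callback is the CFUNCTYPE(None, c_int, c_char_p,
# c_void_p) wrapper defined earlier in this module; keep a reference to the wrapped
# callback so it is not garbage-collected while registered):
#
#   @ggml_log_callback
#   def _on_log(level, text, user_data):
#       print(text.decode("utf-8"), end="")
#
#   llama_log_set(_on_log, None)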
-# //
+# LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
+@ctypes_function(
+ "llama_memory_breakdown_print",
+ [llama_context_p_ctypes],
+ None,
+)
+def llama_memory_breakdown_print(ctx: llama_context_p, /):
+ """Print memory breakdown for context"""
+ ...
+
+
# // Performance utils
# //
@@ -4203,6 +4196,7 @@ def llama_log_set(
# double t_p_eval_ms;
# double t_eval_ms;
+
# int32_t n_p_eval;
# int32_t n_eval;
# int32_t n_reused; // number of times a ggml compute graph had been reused
@@ -4222,6 +4216,7 @@ class llama_perf_context_data(ctypes.Structure):
# struct llama_perf_sampler_data {
# double t_sample_ms;
+
# int32_t n_sample;
# };
class llama_perf_sampler_data(ctypes.Structure):
@@ -4237,8 +4232,7 @@ class llama_perf_sampler_data(ctypes.Structure):
[llama_context_p_ctypes],
llama_perf_context_data,
)
-def llama_perf_context(ctx: llama_context_p, /) -> llama_perf_context_data:
- ...
+def llama_perf_context(ctx: llama_context_p, /) -> llama_perf_context_data: ...
# LLAMA_API void llama_perf_context_print(const struct llama_context * ctx);
@@ -4247,8 +4241,7 @@ def llama_perf_context(ctx: llama_context_p, /) -> llama_perf_context_data:
[llama_context_p_ctypes],
None,
)
-def llama_perf_context_print(ctx: llama_context_p, /):
- ...
+def llama_perf_context_print(ctx: llama_context_p, /): ...
# LLAMA_API void llama_perf_context_reset( struct llama_context * ctx);
@@ -4257,8 +4250,7 @@ def llama_perf_context_print(ctx: llama_context_p, /):
[llama_context_p_ctypes],
None,
)
-def llama_perf_context_reset(ctx: llama_context_p, /):
- ...
+def llama_perf_context_reset(ctx: llama_context_p, /): ...
# // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
@@ -4268,8 +4260,7 @@ def llama_perf_context_reset(ctx: llama_context_p, /):
[llama_sampler_p_ctypes],
llama_perf_sampler_data,
)
-def llama_perf_sampler(chain: llama_sampler_p, /) -> llama_perf_sampler_data:
- ...
+def llama_perf_sampler(chain: llama_sampler_p, /) -> llama_perf_sampler_data: ...
# LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
@@ -4278,8 +4269,7 @@ def llama_perf_sampler(chain: llama_sampler_p, /) -> llama_perf_sampler_data:
[llama_sampler_p_ctypes],
None,
)
-def llama_perf_sampler_print(chain: llama_sampler_p, /):
- ...
+def llama_perf_sampler_print(chain: llama_sampler_p, /): ...
# LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
@@ -4288,8 +4278,7 @@ def llama_perf_sampler_print(chain: llama_sampler_p, /):
[llama_sampler_p_ctypes],
None,
)
-def llama_perf_sampler_reset(chain: llama_sampler_p, /):
- ...
+def llama_perf_sampler_reset(chain: llama_sampler_p, /): ...
# //
@@ -4298,7 +4287,10 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /):
# // function that returns whether or not a given tensor contains trainable parameters
# typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
-llama_opt_param_filter = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p)
+llama_opt_param_filter = ctypes.CFUNCTYPE(
+ ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p
+)
+
# // always returns true
# LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
@@ -4307,8 +4299,9 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /):
[ctypes.c_void_p, ctypes.c_void_p],
ctypes.c_bool,
)
-def llama_opt_param_filter_all(tensor: ctypes.c_void_p, userdata: ctypes.c_void_p, /) -> bool:
- ...
+def llama_opt_param_filter_all(
+ tensor: ctypes.c_void_p, userdata: ctypes.c_void_p, /
+) -> bool: ...
# struct llama_opt_params {
@@ -4317,6 +4310,7 @@ def llama_opt_param_filter_all(tensor: ctypes.c_void_p, userdata: ctypes.c_void_
# llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
# void * param_filter_ud; // userdata for determining which tensors contain trainable parameters
+
# ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
# void * get_opt_pars_ud; // userdata for calculating optimizer parameters
# };
@@ -4325,7 +4319,10 @@ class llama_opt_params(ctypes.Structure):
("n_ctx_train", ctypes.c_uint32),
("param_filter", llama_opt_param_filter),
("param_filter_ud", ctypes.c_void_p),
- ("get_opt_pars", ctypes.c_void_p), # ggml_opt_get_optimizer_params - not implemented here
+ (
+ "get_opt_pars",
+ ctypes.c_void_p,
+ ), # ggml_opt_get_optimizer_params - not implemented here
("get_opt_pars_ud", ctypes.c_void_p),
]
@@ -4336,8 +4333,9 @@ class llama_opt_params(ctypes.Structure):
[llama_context_p_ctypes, llama_model_p_ctypes, llama_opt_params],
None,
)
-def llama_opt_init(lctx: llama_context_p, model: llama_model_p, lopt_params: llama_opt_params, /):
- ...
+def llama_opt_init(
+ lctx: llama_context_p, model: llama_model_p, lopt_params: llama_opt_params, /
+): ...
# LLAMA_API void llama_opt_epoch(
@@ -4353,7 +4351,7 @@ def llama_opt_init(lctx: llama_context_p, model: llama_model_p, lopt_params: lla
[
llama_context_p_ctypes,
ctypes.c_void_p, # ggml_opt_dataset_t
- ctypes.c_void_p, # ggml_opt_result_t
+ ctypes.c_void_p, # ggml_opt_result_t
ctypes.c_void_p, # ggml_opt_result_t
ctypes.c_int64,
ctypes.c_void_p, # ggml_opt_epoch_callback
@@ -4370,5 +4368,4 @@ def llama_opt_epoch(
callback_train: ctypes.c_void_p,
callback_eval: ctypes.c_void_p,
/,
-):
- ...
+): ...
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index a45f8f4..e4cae59 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -39,7 +39,11 @@
# Specify the base name of the shared library to load
_libmtmd_base_name = "mtmd"
_libmtmd_override_path = os.environ.get("MTMD_CPP_LIB")
-_libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path()
+_libmtmd_base_path = (
+ pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib"
+ if _libmtmd_override_path is None
+ else pathlib.Path(_libmtmd_override_path)
+)
# Load the library
_libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path)
@@ -77,9 +81,12 @@ class mtmd_context_params(Structure):
("use_gpu", c_bool),
("print_timings", c_bool),
("n_threads", c_int),
- ("verbosity", c_int), # ggml_log_level
("image_marker", c_char_p),
("media_marker", c_char_p),
+ ("flash_attn_type", c_int), # enum llama_flash_attn_type
+ ("warmup", c_bool),
+ ("image_min_tokens", c_int),
+ ("image_max_tokens", c_int),
]
class mtmd_input_text(Structure):
@@ -109,7 +116,7 @@ def mtmd_context_params_default() -> mtmd_context_params:
@ctypes_function(
"mtmd_init_from_file",
[c_char_p, llama_cpp.llama_model_p_ctypes, mtmd_context_params],
- mtmd_context_p_ctypes
+ mtmd_context_p_ctypes,
)
def mtmd_init_from_file(
mmproj_fname: bytes,
@@ -131,9 +138,7 @@ def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool:
# MTMD_API mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data);
@ctypes_function(
- "mtmd_bitmap_init",
- [c_uint32, c_uint32, POINTER(c_uint8)],
- mtmd_bitmap_p_ctypes
+ "mtmd_bitmap_init", [c_uint32, c_uint32, POINTER(c_uint8)], mtmd_bitmap_p_ctypes
)
def mtmd_bitmap_init(
nx: Union[c_uint32, int],
@@ -167,7 +172,7 @@ def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int:
@ctypes_function(
"mtmd_input_chunks_get",
[mtmd_input_chunks_p_ctypes, c_size_t],
- mtmd_input_chunk_p_ctypes
+ mtmd_input_chunk_p_ctypes,
)
def mtmd_input_chunks_get(
chunks: mtmd_input_chunks_p, idx: Union[c_size_t, int], /
@@ -214,7 +219,7 @@ def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int:
@ctypes_function(
"mtmd_input_chunk_get_tokens_text",
[mtmd_input_chunk_p_ctypes, POINTER(c_size_t)],
- POINTER(llama_cpp.llama_token)
+ POINTER(llama_cpp.llama_token),
)
def mtmd_input_chunk_get_tokens_text(
chunk: mtmd_input_chunk_p, n_tokens_output: "_Pointer[c_size_t]", /
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 13c9512..bad0d4e 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -103,8 +103,9 @@ class ModelSettings(BaseSettings):
offload_kqv: bool = Field(
default=True, description="Whether to offload kqv to the GPU."
)
- flash_attn: bool = Field(
- default=False, description="Whether to use flash attention."
+ flash_attn: Optional[bool] = Field(
+ default=None,
+ description="Use flash attention. None=auto, True=enabled, False=disabled.",
)
# Sampling Params
last_n_tokens_size: int = Field(
diff --git a/tests/test_llama.py b/tests/test_llama.py
index 0a1a9f5..f76190d 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -82,7 +82,7 @@ def test_real_model(llama_cpp_model_path):
cparams.n_threads = multiprocessing.cpu_count()
cparams.n_threads_batch = multiprocessing.cpu_count()
cparams.logits_all = False
- cparams.flash_attn = True
+ cparams.flash_attn_type = llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
context = internals.LlamaContext(model=model, params=cparams)
tokens = model.tokenize(b"Hello, world!", add_bos=True, special=True)
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 4227c9b..be47fb9 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 4227c9be4268ac844921b90f31595f81236bd317
+Subproject commit be47fb9285779e900915bd8246eb9664110d4ba5