diff --git a/.gitattributes b/.gitattributes
index 42076ebfce6..386b9e88c90 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -49,6 +49,7 @@ sconstruct text diff=python
*.c text diff=c
*.cpp text diff=cpp
*.h text diff=c
+*.hpp text diff=cpp
*.idl text diff=c
*.acf text diff=c
diff --git a/.github/workflows/testAndPublish.yml b/.github/workflows/testAndPublish.yml
index ac23f3154a6..a1253c9539e 100644
--- a/.github/workflows/testAndPublish.yml
+++ b/.github/workflows/testAndPublish.yml
@@ -78,7 +78,7 @@ jobs:
- name: Checkout NVDA
uses: actions/checkout@v6
with:
- submodules: true
+ submodules: recursive
- name: Install Python
uses: actions/setup-python@v6
with:
diff --git a/.gitignore b/.gitignore
index 263e375fdfe..1981646f2be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ source/lib
source/lib64
source/typelibs
source/louis
+source/cppjieba
*.obj
*.exp
*.lib
diff --git a/.gitmodules b/.gitmodules
index 0fa0d910a9e..9791fec7baf 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -42,3 +42,6 @@
[submodule "include/nvda-mathcat"]
path = include/nvda-mathcat
url = https://github.com/nvaccess/nvda-mathcat.git
+[submodule "include/cppjieba"]
+ path = include/cppjieba
+ url = https://github.com/yanyiwu/cppjieba
diff --git a/copying.txt b/copying.txt
index 6b33fdaacec..014d8f7e9f9 100644
--- a/copying.txt
+++ b/copying.txt
@@ -1027,6 +1027,7 @@ In addition to these dependencies, the following are also included in NVDA:
* Microsoft Detours: MIT
* Python: PSF
* NSIS: zlib/libpng
+* cppjieba: MIT
Furthermore, NVDA also utilises some static/binary dependencies, details of which can be found at the following URL:
diff --git a/include/cppjieba b/include/cppjieba
new file mode 160000
index 00000000000..9408c1d08fa
--- /dev/null
+++ b/include/cppjieba
@@ -0,0 +1 @@
+Subproject commit 9408c1d08facc6e324dc90260e8cb20ecceebf70
diff --git a/include/readme.md b/include/readme.md
index 12f09af1208..23bdfc8a227 100644
--- a/include/readme.md
+++ b/include/readme.md
@@ -61,3 +61,10 @@ Used in chrome system tests.
Fetch latest from master.
+
+### cppjieba
+
+[cppjieba](https://github.com/yanyiwu/cppjieba).
+
+Fetch latest from master.
+Used for Chinese text segmentation.
diff --git a/nvdaHelper/archBuild_sconscript b/nvdaHelper/archBuild_sconscript
index 22f73cd893a..a39e3b437d6 100644
--- a/nvdaHelper/archBuild_sconscript
+++ b/nvdaHelper/archBuild_sconscript
@@ -226,6 +226,10 @@ Export("detoursLib")
apiHookObj = env.Object("apiHook", "common/apiHook.cpp")
Export("apiHookObj")
+cppjiebaLib = env.SConscript("cppjieba/sconscript")
+Export("cppjiebaLib")
+env.Install(libInstallDir, cppjiebaLib)
+
localLib = env.SConscript("local/sconscript")
Export("localLib")
if signExec:
diff --git a/nvdaHelper/cppjieba/cppjieba.cpp b/nvdaHelper/cppjieba/cppjieba.cpp
new file mode 100644
index 00000000000..d63aa1f130c
--- /dev/null
+++ b/nvdaHelper/cppjieba/cppjieba.cpp
@@ -0,0 +1,124 @@
+/*
+A part of NonVisual Desktop Access (NVDA)
+Copyright (C) 2025 NV Access Limited, Wang Chong
+This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+*/
+
+#include "cppjieba.hpp"
+
+
+using namespace std;
+
+// static members for singleton bookkeeping
+JiebaSingleton* JiebaSingleton::instance = nullptr;
+std::once_flag JiebaSingleton::initFlag;
+
+JiebaSingleton& JiebaSingleton::getInstance(const char* dictDir) {
+ // convert incoming C-string+length to std::string (handles dictDir == nullptr)
+ std::string dir = dictDir;
+
+ // ensure singleton is constructed exactly once
+ std::call_once(JiebaSingleton::initFlag, [&]() {
+ // allocate on heap, so we avoid copy/move and control lifetime
+ JiebaSingleton::instance = new JiebaSingleton(dir.c_str());
+ // optional: register deleter at exit
+ std::atexit([]() {
+ delete JiebaSingleton::instance;
+ JiebaSingleton::instance = nullptr;
+ });
+ });
+
+ // after call_once, instance must be non-null
+ return *JiebaSingleton::instance;
+}
+
+JiebaSingleton& JiebaSingleton::getInstance() {
+ if (!JiebaSingleton::instance) {
+ throw std::runtime_error("JiebaSingleton::getInstance() called before initialization. Call getInstance(dictDir) or initJieba() first.");
+ }
+ return *JiebaSingleton::instance;
+}
+
+JiebaSingleton::JiebaSingleton(const char* dictDir)
+: cppjieba::JiebaSegmenter(
+ std::string(dictDir),
+ std::string(dictDir),
+ std::string(dictDir)
+ )
+{
+ // base class ctor will load dictionaries/models
+}
+
+void JiebaSingleton::getWordEndOffsets(const std::string& text, std::vector<size_t>& wordEndOffsets) {
+ std::lock_guard<std::mutex> lock(segMutex);
+ wordEndOffsets.clear();
+ std::vector<cppjieba::Word> words;
+ this->Cut(text, words, true);
+
+ for (const auto& word : words) {
+ wordEndOffsets.push_back(word.unicode_offset + word.unicode_length);
+ }
+}
+
+extern "C" {
+
+bool initJieba(const char* dictDir) {
+ try {
+ // simply force the singleton into existence
+ (void)JiebaSingleton::getInstance(dictDir);
+ return true;
+ } catch (...) {
+ return false;
+ }
+}
+
+bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen) {
+ if (!text || !wordEndOffsets || !outLen) return false;
+
+ try {
+ std::string textStr(text);
+ std::vector<size_t> offs;
+ JiebaSingleton::getInstance().getWordEndOffsets(textStr, offs);
+
+ int n = static_cast<int>(offs.size());
+ if (n == 0) {
+ *wordEndOffsets = nullptr;
+ *outLen = 0;
+ return true; // success, but no offsets
+ }
+
+ int* buf = static_cast<int*>(std::malloc(sizeof(int) * n));
+ if (!buf) {
+ *wordEndOffsets = nullptr;
+ *outLen = 0;
+ return false;
+ }
+ for (int i = 0; i < n; ++i) buf[i] = offs[i];
+ *wordEndOffsets = buf;
+ *outLen = n;
+ return true;
+ } catch (...) {
+ *wordEndOffsets = nullptr;
+ *outLen = 0;
+ return false;
+ }
+}
+
+bool insertUserWord(const char* word, int freq, const char* tag = cppjieba::UNKNOWN_TAG) {
+ return JiebaSingleton::getInstance().InsertUserWord(string(word), freq, string(tag));
+}
+
+bool deleteUserWord(const char* word, const char* tag = cppjieba::UNKNOWN_TAG) {
+ return JiebaSingleton::getInstance().DeleteUserWord(string(word), string(tag));
+}
+
+bool find(const char* word) {
+ return JiebaSingleton::getInstance().Find(string(word));
+}
+
+void freeOffsets(int* ptr) {
+ if (ptr) std::free(ptr);
+}
+
+} // extern "C"
diff --git a/nvdaHelper/cppjieba/cppjieba.def b/nvdaHelper/cppjieba/cppjieba.def
new file mode 100644
index 00000000000..fca4a152027
--- /dev/null
+++ b/nvdaHelper/cppjieba/cppjieba.def
@@ -0,0 +1,8 @@
+LIBRARY cppjieba
+EXPORTS
+ initJieba
+ calculateWordOffsets
+ insertUserWord
+ deleteUserWord
+ find
+ freeOffsets
diff --git a/nvdaHelper/cppjieba/cppjieba.hpp b/nvdaHelper/cppjieba/cppjieba.hpp
new file mode 100644
index 00000000000..13ccf47acc6
--- /dev/null
+++ b/nvdaHelper/cppjieba/cppjieba.hpp
@@ -0,0 +1,165 @@
+/*
+A part of NonVisual Desktop Access (NVDA)
+Copyright (C) 2025 NV Access Limited, Wang Chong
+This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+*/
+
+#ifndef CPPJIEBA_DLL_H
+#define CPPJIEBA_DLL_H
+#pragma once
+
+#include <string>
+#include <vector>
+#include <set>
+#include <mutex>
+#include <stdexcept>
+#include <cstdlib>
+#include <memory>
+#include "QuerySegment.hpp"
+
+using namespace std;
+
+namespace cppjieba { // copied from Jieba.hpp and modified to drop off its keyword extractor we don't use
+
+class JiebaSegmenter {
+ public:
+ JiebaSegmenter(const string& dict_path,
+ const string& model_path,
+ const string& user_dict_path)
+ : dict_trie_(pathJoin(dict_path, "jieba.dict.utf8"), pathJoin(user_dict_path, "user.dict.utf8")),
+ model_(pathJoin(model_path, "hmm_model.utf8")),
+ mix_seg_(&dict_trie_, &model_) {
+ }
+ ~JiebaSegmenter() {
+ }
+
+ void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
+ mix_seg_.Cut(sentence, words, hmm);
+ }
+
+ bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
+ return dict_trie_.InsertUserWord(word,freq, tag);
+ }
+
+ bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
+ return dict_trie_.DeleteUserWord(word, tag);
+ }
+
+ bool Find(const string& word)
+ {
+ return dict_trie_.Find(word);
+ }
+
+ void ResetSeparators(const string& s) {
+ mix_seg_.ResetSeparators(s);
+ }
+
+ const DictTrie* GetDictTrie() const {
+ return &dict_trie_;
+ }
+
+ const HMMModel* GetHMMModel() const {
+ return &model_;
+ }
+
+ void LoadUserDict(const vector<string>& buf) {
+ dict_trie_.LoadUserDict(buf);
+ }
+
+ void LoadUserDict(const set<string>& buf) {
+ dict_trie_.LoadUserDict(buf);
+ }
+
+ void LoadUserDict(const string& path) {
+ dict_trie_.LoadUserDict(path);
+ }
+
+ private:
+ static string pathJoin(const string& dir, const string& filename) {
+ if (dir.empty()) {
+ return filename;
+ }
+
+ char last_char = dir[dir.length() - 1];
+ if (last_char == '/' || last_char == '\\') {
+ return dir + filename;
+ } else {
+ #ifdef _WIN32
+ return dir + '\\' + filename;
+ #else
+ return dir + '/' + filename;
+ #endif
+ }
+ }
+
+ static string getCurrentDirectory() {
+ string path(__FILE__);
+ size_t pos = path.find_last_of("/\\");
+ return (pos == string::npos) ? "" : path.substr(0, pos);
+ }
+
+ DictTrie dict_trie_;
+ HMMModel model_;
+
+ MixSegment mix_seg_;
+}; // class JiebaSegmenter
+
+} // namespace cppjieba
+
+
+/// @brief Singleton wrapper around cppjieba::Jieba.
+class JiebaSingleton : public cppjieba::JiebaSegmenter {
+public:
+ /// @brief Returns the single instance, constructing on first call.
+ static JiebaSingleton& getInstance(const char* dictDir);
+
+ static JiebaSingleton& getInstance();
+
+ /// @brief Do thread-safe segmentation and compute word end offsets.
+ /// @param text The input text in UTF-8 encoding.
+ /// @param wordEndOffsets Output vector to hold byte offsets of word ends.
+ void getWordEndOffsets(const std::string& text, std::vector<size_t>& wordEndOffsets);
+
+ // singleton bookkeeping
+ static JiebaSingleton* instance;
+ static std::once_flag initFlag;
+
+private:
+ JiebaSingleton(const char* dictDir); ///< private ctor initializes base Jieba
+
+ /// Disable copy and move
+ JiebaSingleton(const JiebaSingleton&) = delete;
+ JiebaSingleton& operator = (const JiebaSingleton&) = delete;
+ JiebaSingleton(JiebaSingleton&&) = delete;
+ JiebaSingleton& operator = (JiebaSingleton&&) = delete;
+
+ std::mutex segMutex; ///< guards concurrent Cut() calls
+};
+
+#ifdef _WIN32
+# define JIEBA_API __declspec(dllexport)
+#else
+# define JIEBA_API
+#endif
+
+extern "C" {
+
+/// @brief Force singleton construction (load dicts, etc.) before any segmentation.
+JIEBA_API bool initJieba(const char* dictDir);
+
+/// @brief Segment UTF-8 text into word end offsets.
+/// @return true on success, false on failure.
+JIEBA_API bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen);
+
+/// Wrapper for word management
+JIEBA_API bool insertUserWord(const char* word, int freq, const char* tag);
+JIEBA_API bool deleteUserWord(const char* word, const char* tag);
+JIEBA_API bool find(const char* word);
+
+/// @brief Free memory allocated by calculateWordOffsets.
+JIEBA_API void freeOffsets(int* ptr);
+
+} // extern "C"
+
+#endif // CPPJIEBA_DLL_H
diff --git a/nvdaHelper/cppjieba/sconscript b/nvdaHelper/cppjieba/sconscript
new file mode 100644
index 00000000000..714c99330a9
--- /dev/null
+++ b/nvdaHelper/cppjieba/sconscript
@@ -0,0 +1,58 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Wang Chong
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+import typing # noqa: E402
+import os
+
+Import(
+ [
+ "thirdPartyEnv",
+ "sourceDir",
+ ]
+)
+thirdPartyEnv: Environment = thirdPartyEnv
+env: Environment = typing.cast(Environment, thirdPartyEnv.Clone())
+
+cppjiebaPath = Dir("#include/cppjieba")
+cppjiebaSrcPath = cppjiebaPath.Dir("include/cppjieba")
+cppjiebaDictPath = cppjiebaPath.Dir("dict")
+outDir = sourceDir.Dir("cppjieba")
+unitTestDictsDir = env.Dir("#tests/unit/cppjiebaDicts")
+LimonpPath = cppjiebaPath.Dir("deps/limonp") # cppjieba's dependency
+LimonpSrcPath = LimonpPath.Dir("include/limonp")
+
+env.Prepend(
+ CPPPATH=[
+ cppjiebaSrcPath,
+ LimonpSrcPath.Dir(".."),
+ ]
+)
+
+sourceFiles = [
+ "cppjieba.cpp",
+ "cppjieba.def",
+]
+
+env.AppendUnique(
+ CCFLAGS=['/wd4819'],
+ CXXFLAGS=['/wd4819'],
+)
+
+cppjiebaLib = env.SharedLibrary(target="cppjieba", source=sourceFiles)
+
+if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # ensure dicts installation happens only once and avoid a scons' warning
+ env.Install(
+ outDir.Dir("dicts"),
+ [
+ env.Dir(cppjiebaDictPath).File(name)
+ for name in (
+ "jieba.dict.utf8",
+ "user.dict.utf8",
+ "hmm_model.utf8",
+ )
+ ]
+ )
+
+Return("cppjiebaLib")
diff --git a/projectDocs/dev/createDevEnvironment.md b/projectDocs/dev/createDevEnvironment.md
index 6c7349c7415..30e5001b19d 100644
--- a/projectDocs/dev/createDevEnvironment.md
+++ b/projectDocs/dev/createDevEnvironment.md
@@ -97,6 +97,7 @@ If you aren't sure, run `git submodule update` after every git pull, merge or ch
* [Nullsoft Install System](https://nsis.sourceforge.io), version 3.11
* [Java Access Bridge 64 bit, from Zulu Community OpenJDK build 17.0.16+8 Zulu (17.60.17)](https://github.com/nvaccess/javaAccessBridge32-bin)
* [Windows Implementation Library (WIL)](https://github.com/microsoft/wil/), commit `7cf41936c5b4ab79daf0d9437211380dc69fa958`
+* [cppjieba - Chinese word segmentation](https://github.com/yanyiwu/cppjieba), commit `9408c1d08facc6e324dc90260e8cb20ecceebf70`
#### Build time dependencies
diff --git a/source/NVDAObjects/window/edit.py b/source/NVDAObjects/window/edit.py
index 9f77c762a0b..2e0d7cb00f8 100644
--- a/source/NVDAObjects/window/edit.py
+++ b/source/NVDAObjects/window/edit.py
@@ -1,5 +1,5 @@
# A part of NonVisual Desktop Access (NVDA)
-# Copyright (C) 2006-2026 NV Access Limited, Babbage B.V., Cyrille Bougot, Leonard de Ruijter
+# Copyright (C) 2006-2026 NV Access Limited, Babbage B.V., Cyrille Bougot, Leonard de Ruijter, Wang Chong
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.
@@ -26,6 +26,7 @@
import watchdog
import locationHelper
import textUtils
+from textUtils.segFlag import CharSegFlag, WordSegFlag
import NVDAHelper.localLib
@@ -163,6 +164,13 @@ class getTextLengthExStruct(ctypes.Structure):
class EditTextInfo(textInfos.offsets.OffsetsTextInfo):
+ # Override segFlags to enforce use of Uniscribe
+ charSegFlag = CharSegFlag.UNISCRIBE
+
+ @property
+ def wordSegFlag(self):
+ return WordSegFlag.UNISCRIBE
+
def _getPointFromOffset(self, offset):
if self.obj.editAPIVersion == 1 or self.obj.editAPIVersion >= 3:
processHandle = self.obj.processHandle
diff --git a/source/braille.py b/source/braille.py
index 6e1d36be208..9752bdc2293 100644
--- a/source/braille.py
+++ b/source/braille.py
@@ -71,7 +71,7 @@
from autoSettingsUtils.driverSetting import BooleanDriverSetting, NumericDriverSetting
from utils.security import objectBelowLockScreenAndWindowsIsLocked, post_sessionLockStateChanged
from winAPI.secureDesktop import post_secureDesktopStateChange
-from textUtils import isUnicodeNormalized, UnicodeNormalizationOffsetConverter
+from textUtils import isUnicodeNormalized, OffsetConverter, UnicodeNormalizationOffsetConverter
import hwIo
from editableText import EditableText
from gui.guiHelper import wxCallOnMain
@@ -600,10 +600,21 @@ def update(self):
if config.conf["braille"]["expandAtCursor"] and self.cursorPos is not None:
mode |= louis.compbrlAtCursor
- converter: UnicodeNormalizationOffsetConverter | None = None
+ converter: OffsetConverter | None = None
textToTranslate = self.rawText
textToTranslateTypeforms = self.rawTextTypeforms
cursorPos = self.cursorPos
+ if (
+ config.conf["braille"]["translationTable"].startswith("zh")
+ or config.conf["braille"]["translationTable"] == "auto"
+ and brailleTables.getDefaultTableForCurLang(brailleTables.TableType.OUTPUT).startswith("zh")
+ ):
+ from textUtils.wordSeg.wordSegUtils import WordSegWithSeparatorOffsetConverter # noqa: F401
+
+ converter = WordSegWithSeparatorOffsetConverter(textToTranslate)
+ textToTranslate = converter.encoded
+ if cursorPos is not None:
+ cursorPos = converter.strToEncodedOffsets(cursorPos)
if config.conf["braille"]["unicodeNormalization"] and not isUnicodeNormalized(textToTranslate):
converter = UnicodeNormalizationOffsetConverter(textToTranslate)
textToTranslate = converter.encoded
diff --git a/source/browseMode.py b/source/browseMode.py
index a8cb791068b..6be86d59e71 100644
--- a/source/browseMode.py
+++ b/source/browseMode.py
@@ -1,6 +1,7 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2007-2026 NV Access Limited, Babbage B.V., James Teh, Leonard de Ruijter,
-# Thomas Stivers, Accessolutions, Julien Cochuyt, Cyrille Bougot, Kefas Lungu
+# Thomas Stivers, Accessolutions, Julien Cochuyt, Cyrille Bougot, Kefas Lungu,
+# Wang Chong
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
diff --git a/source/config/configSpec.py b/source/config/configSpec.py
index 982532d4d5f..ab1d872b61a 100644
--- a/source/config/configSpec.py
+++ b/source/config/configSpec.py
@@ -1,7 +1,8 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2006-2026 NV Access Limited, Babbage B.V., Davy Kager, Bill Dengler, Julien Cochuyt,
# Joseph Lee, Dawid Pieper, mltony, Bram Duvigneau, Cyrille Bougot, Rob Meredith,
-# Burman's Computer and Education Ltd., Leonard de Ruijter, Łukasz Golonka, Cary-rowen, Kefas Lungu
+# Burman's Computer and Education Ltd., Leonard de Ruijter, Łukasz Golonka, Cary-rowen,
+# Wang Chong, Kefas Lungu
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
@@ -279,6 +280,8 @@
reportClickable = boolean(default=true)
[documentNavigation]
+ initWordSegForUnusedLang = boolean(default=false)
+ wordSegmentationStandard = featureFlag(optionsEnum="WordNavigationUnitFlag", behaviorOfDefault="Auto")
paragraphStyle = featureFlag(optionsEnum="ParagraphNavigationFlag", behaviorOfDefault="application")
[reviewCursor]
diff --git a/source/config/featureFlagEnums.py b/source/config/featureFlagEnums.py
index 5bcb1db1fdb..59c78bef409 100644
--- a/source/config/featureFlagEnums.py
+++ b/source/config/featureFlagEnums.py
@@ -1,7 +1,7 @@
# A part of NonVisual Desktop Access (NVDA)
-# Copyright (C) 2022 NV Access Limited, Bill Dengler, Rob Meredith
-# This file is covered by the GNU General Public License.
-# See the file COPYING for more details.
+# Copyright (C) 2022-2025 NV Access Limited, Bill Dengler, Rob Meredith, Wang Chong
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
"""
Feature flag value enumerations.
@@ -139,6 +139,26 @@ def _displayStringLabels(self) -> dict["FontFormattingBrailleModeFlag", str]:
}
+class WordNavigationUnitFlag(DisplayStringEnum):
+ """Enumeration for word navigation."""
+
+ @property
+ def _displayStringLabels(self):
+ return {
+ # Translators: Label for a method of word segmentation.
+ self.AUTO: _("Auto"),
+ # Translators: Label for a method of word segmentation.
+ self.UNISCRIBE: _("Standard"),
+ # Translators: Label for a method of word segmentation.
+ self.CHINESE: _("Chinese"),
+ }
+
+ DEFAULT = enum.auto()
+ AUTO = enum.auto()
+ UNISCRIBE = enum.auto()
+ CHINESE = enum.auto()
+
+
def getAvailableEnums() -> typing.Generator[typing.Tuple[str, FlagValueEnum], None, None]:
for name, value in globals().items():
if (
diff --git a/source/core.py b/source/core.py
index 0deada7f022..dfffb773b05 100644
--- a/source/core.py
+++ b/source/core.py
@@ -1,6 +1,6 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2006-2025 NV Access Limited, Aleksey Sadovoy, Christopher Toth, Joseph Lee, Peter Vágner,
-# Derek Riemer, Babbage B.V., Zahari Yurukov, Łukasz Golonka, Cyrille Bougot, Julien Cochuyt
+# Derek Riemer, Babbage B.V., Zahari Yurukov, Łukasz Golonka, Cyrille Bougot, Julien Cochuyt, Wang Chong
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.
@@ -926,6 +926,17 @@ def main():
_remoteClient.initialize()
+ from textUtils import wordSeg
+
+ log.debug("Initializing word segmentation module")
+
+ try:
+ wordSeg.initialize()
+ except RuntimeError:
+ log.warning("Word segmentation module disabled in configuration")
+ except Exception:
+ log.error("Error initializing word segmentation module", exc_info=True)
+
if globalVars.appArgs.install or globalVars.appArgs.installSilent:
import gui.installerGui
diff --git a/source/displayModel.py b/source/displayModel.py
index fde2cb0110e..941f11a6234 100644
--- a/source/displayModel.py
+++ b/source/displayModel.py
@@ -1,7 +1,7 @@
# A part of NonVisual Desktop Access (NVDA)
-# This file is covered by the GNU General Public License.
-# See the file COPYING for more details.
-# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Joseph Lee, Cyrille Bougot
+# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Joseph Lee, Cyrille Bougot, Wang Chong
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
from ctypes import byref, c_short, c_long
import unicodedata
@@ -22,6 +22,7 @@
import windowUtils
from locationHelper import RectLTRB, RectLTWH
import textUtils
+from textUtils.segFlag import CharSegFlag, WordSegFlag
from typing import (
List,
Tuple,
@@ -525,7 +526,12 @@ def _getStoryLength(self):
return lineEndOffsets[-1]
return 0
- useUniscribe = False
+ # Override segFlags to strictly use the old fallen-back method
+ charSegFlag = CharSegFlag.NONE
+
+ @property
+ def wordSegFlag(self):
+ return WordSegFlag.NONE
def _getTextRange(self, start, end):
return "".join(x for x in self._getFieldsInRange(start, end) if isinstance(x, str))
diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py
index c549a6ad38c..39df035ba4c 100644
--- a/source/gui/settingsDialogs.py
+++ b/source/gui/settingsDialogs.py
@@ -6,7 +6,7 @@
# Łukasz Golonka, Aaron Cannon, Adriani90, André-Abush Clause, Dawid Pieper,
# Takuya Nishimoto, jakubl7545, Tony Malykh, Rob Meredith,
# Burman's Computer and Education Ltd, hwf1324, Cary-rowen, Christopher Proß, Tianze
-# Neil Soiffer, Ryan McCleary, Kefas Lungu.
+# Neil Soiffer, Ryan McCleary, Wang Chong, Kefas Lungu.
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
@@ -47,7 +47,6 @@
import speech
import speechDictHandler
import systemUtils
-from utils.security import isRunningOnSecureDesktop
import vision
import vision.providerBase
import vision.providerInfo
@@ -57,6 +56,8 @@
import screenCurtain._screenCurtain
from utils import mmdevice
+from utils.debounce import debounceLimiter
+from utils.security import isRunningOnSecureDesktop
from vision.providerBase import VisionEnhancementProviderSettings
from wx.lib.expando import ExpandoTextCtrl
import wx.lib.newevent
@@ -750,13 +751,16 @@ def _doCategoryChange(self, newCatId):
self.container.Thaw()
def onCategoryChange(self, evt: wx.ListEvent):
- currentCat = self.currentCategory
newIndex = evt.GetIndex()
- if not currentCat or newIndex != self.categoryClasses.index(currentCat.__class__):
+ if self._shouldDoCategoryChange(newIndex):
self._doCategoryChange(newIndex)
else:
evt.Skip()
+ def _shouldDoCategoryChange(self, index: int) -> bool:
+ currentCat = self.currentCategory
+ return not currentCat or index != self.categoryClasses.index(currentCat.__class__)
+
def _validateAllPanels(self):
"""Check if all panels are valid, and can be saved
@note: raises ValueError if a panel is not valid. See c{SettingsPanel.isValid}
@@ -3543,6 +3547,17 @@ class DocumentNavigationPanel(SettingsPanel):
def makeSettings(self, settingsSizer: wx.BoxSizer) -> None:
sHelper = guiHelper.BoxSizerHelper(self, sizer=settingsSizer)
+
+ # Translators: This is a label for the word segmentation standard in the document navigation dialog
+ WordNavigationUnitLabel = _("&Word Segmentation Standard:")
+ self.wordSegCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl(
+ labelText=WordNavigationUnitLabel,
+ wxCtrlClass=nvdaControls.FeatureFlagCombo,
+ keyPath=["documentNavigation", "wordSegmentationStandard"],
+ conf=config.conf,
+ )
+ self.bindHelpEvent("wordSegmentationStandard", self.wordSegCombo)
+
# Translators: This is a label for the paragraph navigation style in the document navigation dialog
paragraphStyleLabel = _("&Paragraph style:")
self.paragraphStyleCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl(
@@ -3554,8 +3569,21 @@ def makeSettings(self, settingsSizer: wx.BoxSizer) -> None:
self.bindHelpEvent("ParagraphStyle", self.paragraphStyleCombo)
def onSave(self):
+ self.wordSegCombo.saveCurrentValueToConf()
self.paragraphStyleCombo.saveCurrentValueToConf()
+ def postSave(self):
+ from textUtils import wordSeg
+
+ log.debug("Reinitializing word segmentation module")
+
+ try:
+ wordSeg.initialize()
+ except RuntimeError:
+ log.warning("Word segmentation module disabled in configuration")
+ except Exception:
+ log.error("Error reinitializing word segmentation module", exc_info=True)
+
def _synthWarningDialog(newSynth: str):
gui.messageBox(
@@ -6433,6 +6461,7 @@ def _confirmEnableScreenCurtainWithUser(self) -> bool:
class NVDASettingsDialog(MultiCategorySettingsDialog):
# Translators: This is the label for the NVDA settings dialog.
title = _("NVDA Settings")
+ _pendingCategoryIndex: int | None = None
categoryClasses = [
GeneralSettingsPanel,
SpeechSettingsPanel,
@@ -6502,13 +6531,36 @@ def _getDialogTitle(self):
configProfile=NvdaSettingsDialogActiveConfigProfile,
)
+ def _doCategoryChangeForIndex(self, newIndex: int) -> bool:
+ if self._shouldDoCategoryChange(newIndex):
+ self._doCategoryChange(newIndex)
+ return True
+ return False
+
+ @debounceLimiter(
+ cooldownTimeMs=500,
+ delayTimeMs=500,
+ )
+ def _onCategoryChangeDebounced(self) -> None:
+ if self._pendingCategoryIndex is not None:
+ if self._doCategoryChangeForIndex(self._pendingCategoryIndex):
+ self._doOnCategoryChange()
+ self._pendingCategoryIndex = None
+
def onCategoryChange(self, evt: wx.ListEvent):
+ if isRunningOnSecureDesktop():
+ # Secure desktop can cause issues with rapidly changing categories,
+ # so we debounce category changes to avoid this. (#19634)
+ self._pendingCategoryIndex = evt.GetIndex()
+ self._onCategoryChangeDebounced()
+ return
super().onCategoryChange(evt)
if evt.Skipped:
return
self._doOnCategoryChange()
def Destroy(self):
+ self._pendingCategoryIndex = None
global NvdaSettingsDialogActiveConfigProfile, NvdaSettingsDialogWindowHandle
NvdaSettingsDialogActiveConfigProfile = None
NvdaSettingsDialogWindowHandle = None
diff --git a/source/setup.py b/source/setup.py
index 10e4ab18ff1..2eb4fd48849 100755
--- a/source/setup.py
+++ b/source/setup.py
@@ -351,6 +351,7 @@ def _genManifestTemplate(shouldHaveUIAccess: bool) -> tuple[int, int, bytes]:
("images", glob("images/*.ico")),
("fonts", glob("fonts/*.ttf")),
("louis/tables", glob("louis/tables/*")),
+ ("cppjieba/dicts", glob("cppjieba/dicts/*")),
("COMRegistrationFixes", glob("COMRegistrationFixes/*.reg")),
("miscDeps/tools", ["../miscDeps/tools/msgfmt.exe"]),
(".", glob("../miscDeps/python/*.dll")),
diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py
index 1def339c7b4..11d8c297314 100755
--- a/source/textInfos/offsets.py
+++ b/source/textInfos/offsets.py
@@ -1,5 +1,5 @@
# A part of NonVisual Desktop Access (NVDA)
-# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter
+# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
@@ -8,12 +8,14 @@
import ctypes
import unicodedata
import NVDAHelper
+import config.featureFlagEnums
import NVDAState
import config
import textInfos
import locationHelper
from treeInterceptorHandler import TreeInterceptor
import textUtils
+from textUtils.segFlag import CharSegFlag, WordSegFlag
from dataclasses import dataclass
from typing import (
Optional,
@@ -156,8 +158,21 @@ class OffsetsTextInfo(textInfos.TextInfo):
#: Honours documentFormatting config option if true - set to false if this is not at all slow.
detectFormattingAfterCursorMaybeSlow: bool = True
- #: Use uniscribe to calculate word offsets etc.
- useUniscribe: bool = True
+ #: Method to calculate character and word offsets.
+ charSegFlag: CharSegFlag = CharSegFlag.UNISCRIBE
+
+ @property
+ def wordSegFlag(self) -> WordSegFlag | None:
+ match self.wordSegConf.calculated():
+ case config.featureFlagEnums.WordNavigationUnitFlag.UNISCRIBE:
+ return WordSegFlag.UNISCRIBE
+ case config.featureFlagEnums.WordNavigationUnitFlag.AUTO:
+ return WordSegFlag.AUTO
+ case config.featureFlagEnums.WordNavigationUnitFlag.CHINESE:
+ return WordSegFlag.CHINESE
+ case _:
+ log.error(f"Unknown word segmentation standard, {self.wordSegConf.calculated()!r}")
+
#: The encoding internal to the underlying text info implementation.
encoding: Optional[str] = textUtils.WCHAR_ENCODING
@@ -377,7 +392,7 @@ def _getCharacterOffsets(self, offset):
lineStart, lineEnd = self._getLineOffsets(offset)
lineText = self._getTextRange(lineStart, lineEnd)
relOffset = offset - lineStart
- if self.useUniscribe:
+ if self.charSegFlag == CharSegFlag.UNISCRIBE:
offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_CHARACTER, relOffset)
if offsets is not None:
return (offsets[0] + lineStart, offsets[1] + lineStart)
@@ -401,8 +416,10 @@ def _getWordOffsets(self, offset):
# Convert NULL and non-breaking space to space to make sure that words will break on them
lineText = lineText.translate({0: " ", 0xA0: " "})
relOffset = offset - lineStart
- if self.useUniscribe:
- offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_WORD, relOffset)
+ if self.wordSegFlag:
+ offsets = textUtils.WordSegmenter(lineText, self.encoding, self.wordSegFlag).getSegmentForOffset(
+ relOffset,
+ )
if offsets is not None:
return (offsets[0] + lineStart, offsets[1] + lineStart)
# Fall back to the older word offsets detection that only breaks on non alphanumeric
@@ -476,6 +493,10 @@ def __init__(self, obj, position):
Subclasses may extend this to perform implementation specific initialisation, calling their superclass method afterwards.
"""
super(OffsetsTextInfo, self).__init__(obj, position)
+ self.wordSegConf: config.featureFlag.FeatureFlag = config.conf["documentNavigation"][
+ "wordSegmentationStandard"
+ ]
+
from NVDAObjects import NVDAObject
if isinstance(position, locationHelper.Point):
@@ -562,6 +583,13 @@ def collapse(self, end=False):
self._startOffset = self._endOffset
def expand(self, unit):
+ if unit == textInfos.UNIT_WORD and self.isCollapsed and self._startOffset == self._getStoryLength():
+ try:
+ flowsTo = self.obj.flowsTo
+ except (AttributeError, NotImplementedError):
+ flowsTo = None
+ if not flowsTo:
+ return
self._startOffset, self._endOffset = self._getUnitOffsets(unit, self._startOffset)
def copy(self):
diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py
index 9086060fb19..edc6757dc9c 100644
--- a/source/textUtils/__init__.py
+++ b/source/textUtils/__init__.py
@@ -1,16 +1,18 @@
# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2018-2026 NV Access Limited, Babbage B.V., Łukasz Golonka, Wang Chong
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.
-# Copyright (C) 2018-2026 NV Access Limited, Babbage B.V., Łukasz Golonka
"""
Classes and utilities to deal with offsets variable width encodings, particularly utf_16.
"""
import ctypes
+import re
import encodings
import locale
import unicodedata
+
from abc import ABCMeta, abstractmethod, abstractproperty
from functools import cached_property
from typing import Generator, Optional, Tuple, Type
@@ -18,6 +20,8 @@
from logHandler import log
from .uniscribe import splitAtCharacterBoundaries
+from .wordSeg import wordSegStrategy
+from .segFlag import WordSegFlag
WCHAR_ENCODING = "utf_16_le"
UTF8_ENCODING = "utf-8"
@@ -581,3 +585,62 @@ def getOffsetConverter(encoding: str) -> Type[OffsetConverter]:
return ENCODINGS_TO_CONVERTERS[encoding]
except IndexError as e:
raise LookupError(f"Don't know how to deal with encoding '{encoding}'", e)
+
+
+class WordSegmenter:
+ """Selects appropriate segmentation strategy and segments text."""
+
+ # Precompiled patterns
+ # Chinese characters and Japanese kanji (CJK Unified Ideographs U+4E00 - U+9FFF)
+ _CHINESE_CHARACTER_AND_JAPANESE_KANJI: re.Pattern = re.compile(r"[\u4E00-\u9FFF]")
+ # Japanese kana (Hiragana U+3040 - U+309F, Katakana U+30A0 - U+30FF)
+ _KANA: re.Pattern = re.compile(r"[\u3040-\u309F\u30A0-\u30FF]")
+
+ def __init__(self, text: str, encoding: str = "UTF-8", wordSegFlag: WordSegFlag = WordSegFlag.AUTO):
+ self.text: str = text
+ self.encoding: str | None = encoding
+ self.wordSegFlag: WordSegFlag = wordSegFlag
+ self.strategy: wordSegStrategy.WordSegmentationStrategy = self._chooseStrategy()
+
+ def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: optimize
+ """Choose the appropriate segmentation strategy based on the text content."""
+ if self.wordSegFlag == WordSegFlag.AUTO:
+ if (
+ wordSegStrategy.ChineseWordSegmentationStrategy._lib
+ and WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search(
+ self.text,
+ )
+ and not WordSegmenter._KANA.search(self.text)
+ ):
+ return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding)
+ else:
+ return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)
+ else:
+ match self.wordSegFlag:
+ case WordSegFlag.UNISCRIBE:
+ return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)
+ case WordSegFlag.CHINESE:
+ if wordSegStrategy.ChineseWordSegmentationStrategy._lib:
+ return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding)
+ else:
+ log.debugWarning("Chinese word segmenter is loading. Falling back to Uniscribe.")
+ return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)
+ case _:
+ return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)
+
+ def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
+ """Get the segment containing the given offset."""
+ try:
+ return self.strategy.getSegmentForOffset(offset)
+ except Exception as e:
+ log.debugWarning(
+ "WordSegmenter.getSegmentForOffset failed: %s text: '%s' offset: %s segmentation strategy: %s",
+ e,
+ self.text,
+ offset,
+ self.strategy,
+ )
+ return None
+
+ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str:
+ return self.strategy.segmentedText(sep, newSepIndex)
diff --git a/source/textUtils/segFlag.py b/source/textUtils/segFlag.py
new file mode 100644
index 00000000000..72153c80e18
--- /dev/null
+++ b/source/textUtils/segFlag.py
@@ -0,0 +1,28 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Wang Chong
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+from enum import IntFlag
+
+# shared bit masks (explicit powers of two)
+_AUTO: int = 1 << 0
+_UNISCRIBE: int = 1 << 1
+_CHINESE: int = 1 << 2
+
+
+class CharSegFlag(IntFlag):
+ """Character-level segmentation flags."""
+
+ NONE: int = 0
+ AUTO: int = _AUTO
+ UNISCRIBE: int = _UNISCRIBE
+
+
+class WordSegFlag(IntFlag):
+ """Word-level segmentation flags."""
+
+ NONE: int = 0
+ AUTO: int = _AUTO
+ UNISCRIBE: int = _UNISCRIBE
+ CHINESE: int = _CHINESE
diff --git a/source/textUtils/wordSeg/__init__.py b/source/textUtils/wordSeg/__init__.py
new file mode 100644
index 00000000000..77231b58fa3
--- /dev/null
+++ b/source/textUtils/wordSeg/__init__.py
@@ -0,0 +1,46 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Wang Chong
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+import importlib
+from logHandler import log
+
+
+def initialize():
+ """
+ Call all registered initializer functions recorded in wordSegStrategy.initializerList.
+
+ Each entry is a tuple: (module_name, qualname, func_obj, args, kwargs).
+ We try to resolve the callable from the module and qualname at runtime
+ (this handles classmethod/staticmethod wrapping order). If resolution fails,
+ we fall back to the stored func_obj.
+
+    Each initializer is started on its own daemon thread; failures while resolving
+    or starting an initializer are caught and logged so the rest still run.
+ """
+
+ from . import wordSegStrategy
+ from threading import Thread
+
+ for module_name, qualname, func_obj, args, kwargs in wordSegStrategy.initializerList:
+ callable_to_call = None
+ # try to resolve module + qualname to a current attribute (handles classmethod/staticmethod)
+ try:
+ mod = importlib.import_module(module_name)
+ obj = mod
+ for part in qualname.split("."):
+ obj = getattr(obj, part)
+ callable_to_call = obj
+ except Exception:
+ # fallback to original function object captured during decoration
+ callable_to_call = func_obj
+
+ # Final call with its args/kwargs and exception handling
+ try:
+ if not callable(callable_to_call):
+ raise TypeError(f"Resolved initializer is not callable: {module_name}.{qualname}")
+ Thread(target=callable_to_call, args=args, kwargs=kwargs, daemon=True).start()
+ except Exception as e:
+ log.debug("Initializer %s.%s failed: %s", module_name, qualname, e)
+ return
diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py
new file mode 100644
index 00000000000..7c9351170ac
--- /dev/null
+++ b/source/textUtils/wordSeg/wordSegStrategy.py
@@ -0,0 +1,331 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Wang Chong
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+import os
+import ctypes
+from ctypes import (
+ c_bool,
+ c_char_p,
+ c_int,
+ create_string_buffer,
+ POINTER,
+ byref,
+)
+from abc import ABC, abstractmethod
+from functools import lru_cache
+from collections.abc import Callable
+from typing import Any
+import re
+import unicodedata
+
+import textUtils
+from logHandler import log
+
+
+# Initializer registry (robust: saves module + qualname + original function + args/kwargs)
+# Each entry: (module_name: str, qualname: str, func_obj: Callable, args: tuple, kwargs: dict)
+initializerList: list[tuple[str, str, Callable[..., Any], tuple[Any, ...], dict[str, Any]]] = []
+
+
+def initializerRegistry(*decorator_args, **decorator_kwargs):
+ """
+ A decorator to register an initializer function.
+ Usage:
+ @initializerRegistry
+ def f(): ...
+ or with arguments:
+ @initializerRegistry(arg1, arg2, kw=val)
+ def f(...): ...
+ We save (func.__module__, func.__qualname__, func, args, kwargs) so that during
+ package initialize() we can dynamically resolve the callable from the module
+ (this handles classmethod/staticmethod ordering issues).
+ """
+ if decorator_args and callable(decorator_args[0]) and len(decorator_args) == 1 and not decorator_kwargs:
+ func = decorator_args[0]
+ initializerList.append((func.__module__, func.__qualname__, func, (), {}))
+ return func
+
+ def _decorator(func: Callable[..., Any]):
+ initializerList.append((func.__module__, func.__qualname__, func, decorator_args, decorator_kwargs))
+ return func
+
+ return _decorator
+
+
+class WordSegmentationStrategy(ABC):
+ """Abstract base class for word segmentation strategies."""
+
+ def __init__(self, text: str, encoding: str | None = None):
+ self.text: str = text
+ self.encoding: str | None = encoding
+ self.wordEnds: list[int] = []
+
+ @abstractmethod
+ def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: # TODO: optimize
+ """Return (start inclusive, end exclusive) or None. Offsets are str offsets relative to self.text."""
+ pass
+
+ @abstractmethod
+ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str:
+ """Segmented result with separators."""
+ pass
+
+ def getWordOffsetRange(
+ self,
+ offset: int,
+ ) -> tuple[int, int] | None:
+ """Helper to get word offset range from a list of word end offsets."""
+ if not self.wordEnds:
+ return None
+ index = next((i for i, end in enumerate(self.wordEnds) if end > offset), len(self.wordEnds) - 1)
+ start = 0 if index == 0 else self.wordEnds[index - 1]
+ end = self.wordEnds[index]
+ return (start, end)
+
+ @classmethod
+ def isUsingRelatedLanguage(cls) -> bool:
+ """Returns True if this strategy is for the current language."""
+
+ if not hasattr(cls, "_LANGUAGE_PATTERN"):
+ return False
+
+ import languageHandler
+ import braille
+
+ return (
+ re.match(cls._LANGUAGE_PATTERN, languageHandler.getWindowsLanguage())
+ or re.match(cls._LANGUAGE_PATTERN, languageHandler.getLanguage())
+ or re.match(cls._LANGUAGE_PATTERN, braille.handler.table.fileName)
+ )
+
+
+class UniscribeWordSegmentationStrategy(WordSegmentationStrategy):
+ """Windows Uniscribe-based segmentation (calls NVDAHelper.localLib.calculateWordOffsets)."""
+
+    # Copied from textInfos.offsets.OffsetsTextInfo. TODO: optimize
+ def _calculateUniscribeOffsets(
+ self,
+ lineText: str,
+ relOffset: int,
+ ) -> tuple[int, int] | None:
+ """
+ Calculates the bounds of a unit at an offset within a given string of text
+ using the Windows uniscribe library, also used in Notepad, for example.
+        Only the word unit is supported here; this helper always calls calculateWordOffsets.
+ @param lineText: the text string to analyze
+ @param relOffset: the character offset within the text string at which to calculate the bounds.
+ """
+
+ import NVDAHelper
+
+ helperFunc = NVDAHelper.localLib.calculateWordOffsets
+
+ relStart = ctypes.c_int()
+ relEnd = ctypes.c_int()
+ # uniscribe does some strange things
+ # when you give it a string with not more than two alphanumeric chars in a row.
+ # Inject two alphanumeric characters at the end to fix this
+ uniscribeLineText = lineText + "xx"
+ # We can't rely on len(lineText) to calculate the length of the line.
+ offsetConverter = textUtils.WideStringOffsetConverter(lineText)
+ lineLength = offsetConverter.encodedStringLength
+ if self.encoding != textUtils.WCHAR_ENCODING:
+ # We need to convert the str based line offsets to wide string offsets.
+ relOffset = offsetConverter.strToEncodedOffsets(relOffset, relOffset)[0]
+ uniscribeLineLength = lineLength + 2
+ if helperFunc(
+ uniscribeLineText,
+ uniscribeLineLength,
+ relOffset,
+ ctypes.byref(relStart),
+ ctypes.byref(relEnd),
+ ):
+ relStart = relStart.value
+ relEnd = min(lineLength, relEnd.value)
+ if self.encoding != textUtils.WCHAR_ENCODING:
+ # We need to convert the uniscribe based offsets to str offsets.
+ relStart, relEnd = offsetConverter.encodedToStrOffsets(relStart, relEnd)
+ return (relStart, relEnd)
+ return None
+
+ def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
+ return self._calculateUniscribeOffsets(self.text, offset)
+
+ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str:
+ return self.text
+
+
+class ChineseWordSegmentationStrategy(WordSegmentationStrategy):
+ _lib = None
+ _LANGUAGE_PATTERN = re.compile(r"^zh", re.IGNORECASE)
+
+ @classmethod
+ @initializerRegistry
+ def _initCppJieba(cls, forceInit: bool = False): # TODO: make cppjieba alternative
+ """
+ Class-level initializer: attempts to load the versioned cppjieba library and
+ set up ctypes signatures.
+ """
+ import config
+
+ if not forceInit and (
+ cls._lib
+ or (
+ config.conf["documentNavigation"]["wordSegmentationStandard"].calculated()
+ != config.featureFlagEnums.WordNavigationUnitFlag.CHINESE
+ and not cls.isUsingRelatedLanguage()
+ )
+ ):
+ return
+ try:
+ from NVDAState import ReadPaths
+
+ lib_path = os.path.join(ReadPaths.coreArchLibPath, "cppjieba.dll")
+ cls._lib = ctypes.cdll.LoadLibrary(lib_path)
+
+ # Setup function signatures
+ # bool initJieba(const char* dictDir)
+ cls._lib.initJieba.restype = c_bool
+ cls._lib.initJieba.argtypes = [c_char_p]
+
+ # bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen)
+ cls._lib.calculateWordOffsets.restype = c_bool
+ cls._lib.calculateWordOffsets.argtypes = [c_char_p, POINTER(POINTER(c_int)), POINTER(c_int)]
+
+ # bool insertUserWord(const char* word, int freq, const char* tag)
+ cls._lib.insertUserWord.restype = c_bool
+ cls._lib.insertUserWord.argtypes = [c_char_p, c_int, c_char_p]
+
+ # bool deleteUserWord(const char* word, const char* tag)
+ cls._lib.deleteUserWord.restype = c_bool
+ cls._lib.deleteUserWord.argtypes = [c_char_p, c_char_p]
+
+ # bool find(const char* word)
+ cls._lib.find.restype = c_bool
+ cls._lib.find.argtypes = [c_char_p]
+
+ # void freeOffsets(int* offsets)
+ cls._lib.freeOffsets.restype = None
+ cls._lib.freeOffsets.argtypes = [POINTER(c_int)]
+
+ # Initialize with dictionary path
+ import globalVars
+
+ DICTS_DIR = os.path.join(globalVars.appDir, "cppjieba", "dicts")
+ DICTS_DIR_BYTES = DICTS_DIR.encode("utf-8")
+ dictDir = create_string_buffer(DICTS_DIR_BYTES)
+ cls._lib.initJieba(dictDir)
+ except Exception as e:
+ log.debugWarning("Failed to load cppjieba library: %s", e)
+ cls._lib = None
+
+ @lru_cache(maxsize=256)
+ def _callCppjiebaCached(self, text_utf8: bytes) -> list[int] | None:
+ if self._lib is None:
+ return None
+
+ charPtr = POINTER(c_int)()
+ outLen = c_int(0)
+
+ try:
+ success: bool = self._lib.calculateWordOffsets(text_utf8, byref(charPtr), byref(outLen))
+ if not success or not bool(charPtr) or outLen.value <= 0:
+ return None
+
+ try:
+ n = outLen.value
+ offsets = [charPtr[i] for i in range(n)]
+ return offsets
+ finally:
+ self._lib.freeOffsets(charPtr)
+ except Exception as e:
+ log.debugWarning("Exception calling cppjieba: %s", e)
+ try:
+ if bool(charPtr):
+ self._lib.freeOffsets(charPtr)
+ except Exception:
+ pass
+ return None
+
+ def _callCPPJieba(self) -> list[int] | None:
+ """
+ Instance method: encode self.text and call cppjieba.
+ Returns list[int] on success, None on failure.
+ Uses LRU cache keyed by utf-8 bytes.
+ """
+ data = self.text.encode("utf-8")
+
+ if getattr(self, "_lib", None) is ChineseWordSegmentationStrategy._lib:
+ return self._callCppjiebaCached(data)
+ else:
+ if self._lib is None:
+ return None
+
+ charPtr = POINTER(c_int)()
+ outLen = c_int(0)
+ try:
+ success: bool = self._lib.calculateWordOffsets(data, byref(charPtr), byref(outLen))
+ if not success or not bool(charPtr) or outLen.value <= 0:
+ return None
+
+ try:
+ n = outLen.value
+ return [charPtr[i] for i in range(n)]
+ finally:
+ self._lib.freeOffsets(charPtr)
+ except Exception as e:
+ log.debugWarning("Exception calling cppjieba: %s", e)
+ try:
+ if bool(charPtr):
+ self._lib.freeOffsets(charPtr)
+ except Exception:
+ pass
+ return None
+
+ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str:
+ """Segments the text using the word end indices."""
+
+ if len(self.wordEnds) <= 1:
+ return self.text
+
+ result = ""
+ for sepIndex in range(len(self.wordEnds) - 1):
+ preIndex = 0 if sepIndex == 0 else self.wordEnds[sepIndex - 1]
+ curIndex = self.wordEnds[sepIndex]
+ postIndex = self.wordEnds[sepIndex + 1]
+
+ # append the token before the potential separator position
+ result += self.text[preIndex:curIndex]
+
+ # quick checks: avoid adding duplicate separator if already present
+ if result.endswith(sep) or self.text[curIndex:postIndex].startswith(sep):
+ # separator already present at either side -> skip adding
+ continue
+
+ # Unicode categories for punctuation
+ PUNCTUATION_CATEGORIES: str = "pP"
+ # Determine whether any punctuation forbids a separator
+ noSep = (
+ unicodedata.category(self.text[curIndex - 1])[0] in PUNCTUATION_CATEGORIES
+ or unicodedata.category(self.text[curIndex])[0] in PUNCTUATION_CATEGORIES
+ )
+
+ if not noSep:
+ # If neither side forbids the separator, add it
+ result += sep
+ if newSepIndex is not None:
+ newSepIndex.append(len(result) - len(sep))
+ else:
+ # append the final trailing token after the loop
+ result += self.text[curIndex:postIndex]
+
+ return result
+
+ def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
+ return self.getWordOffsetRange(offset)
+
+ def __init__(self, text, encoding=None):
+ super().__init__(text, encoding)
+ self.wordEnds = self._callCPPJieba()
diff --git a/source/textUtils/wordSeg/wordSegUtils.py b/source/textUtils/wordSeg/wordSegUtils.py
new file mode 100644
index 00000000000..d26a26cd9ba
--- /dev/null
+++ b/source/textUtils/wordSeg/wordSegUtils.py
@@ -0,0 +1,119 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Wang Chong
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+from functools import cached_property
+from textUtils import OffsetConverter, WordSegmenter
+
+
+class WordSegWithSeparatorOffsetConverter(OffsetConverter):
+ """An offset converter for text with word segmentation separator."""
+
+ sep: str = " "
+ computedStrToEncodedOffsets: list[int]
+ computedEncodedToStrOffsets: list[int]
+
+ def __init__(self, text: str):
+ super().__init__(text)
+ self.newSepIndex: list[int] = []
+ self.encoded = WordSegmenter(text).segmentedText(sep=self.sep, newSepIndex=self.newSepIndex)
+
+ @cached_property
+ def _separatorFlag(self) -> list[bool]:
+ isSep = [False] * self.encodedStringLength
+ for pos in self.newSepIndex:
+ isSep[pos] = True
+ return isSep
+
+ @cached_property
+ def computedStrToEncodedOffsets(self) -> list[int]:
+ """
+        Compute a mapping so that:
+        encodedIndex = computedStrToEncodedOffsets[strIndex]
+
+        We build an explicit mapping from original string indices to encoded indices
+        by marking separator positions in the encoded string and then assigning
+        each non-separator encoded slot to the next original-character index.
+        The returned list contains the absolute encoded index for each
+        original index.
+ """
+ strLen = self.strLength
+
+ # build explicit str -> encoded mapping
+ strToEncoded: list[int] = [0] * strLen
+ nextStrIndex = 0
+ for encodedIndex in range(self.encodedStringLength):
+ if not self._separatorFlag[encodedIndex]:
+ # assign the current original-char index to this encoded slot
+ # then advance to the next original index
+ if nextStrIndex >= strLen:
+ # defensive: there should not be more non-sep encoded slots than strLen
+ # but handle gracefully
+ break
+ strToEncoded[nextStrIndex] = encodedIndex
+ nextStrIndex += 1
+
+ return strToEncoded
+
+ @cached_property
+ def computedEncodedToStrOffsets(self) -> list[int]:
+ # build explicit encoded -> str mapping
+ # semantics: separator positions and the following encoded character
+ # both map to the same upcoming original str index (insertion point semantics).
+ encodedToStr: list[int] = [0] * self.encodedStringLength
+ nextStrIndex = 0
+ for encodedIndex in range(self.encodedStringLength):
+ if self._separatorFlag[encodedIndex]:
+ # map separator to the next original character index (insertion point)
+ encodedToStr[encodedIndex] = nextStrIndex
+ else:
+ # map this encoded character to the current original index,
+ # then advance the original index for subsequent positions
+ encodedToStr[encodedIndex] = nextStrIndex
+ nextStrIndex += 1
+
+ return encodedToStr
+
+ @cached_property
+ def encodedStringLength(self) -> int:
+ """Returns the length of the string in its subclass-specific encoded representation."""
+ return len(self.encoded)
+
+ def strToEncodedOffsets(
+ self,
+ strStart: int,
+ strEnd: int | None = None,
+ raiseOnError: bool = False,
+ ) -> int | tuple[int, int]:
+ super().strToEncodedOffsets(strStart, strEnd, raiseOnError)
+ if strStart == 0:
+ resultStart = 0
+ else:
+ resultStart = self.computedStrToEncodedOffsets[strStart]
+ if strEnd is None:
+ return resultStart
+ elif strStart == strEnd:
+ return (resultStart, resultStart)
+ else:
+ resultEnd = self.computedStrToEncodedOffsets[strEnd]
+ return (resultStart, resultEnd)
+
+ def encodedToStrOffsets(
+ self,
+ encodedStart: int,
+ encodedEnd: int | None = None,
+ raiseOnError: bool = False,
+    ) -> int | tuple[int, int]:
+ super().encodedToStrOffsets(encodedStart, encodedEnd, raiseOnError)
+ if encodedStart == 0:
+ resultStart = 0
+ else:
+ resultStart = self.computedEncodedToStrOffsets[encodedStart]
+ if encodedEnd is None:
+ return resultStart
+ elif encodedStart == encodedEnd:
+ return (resultStart, resultStart)
+ else:
+ resultEnd = self.computedEncodedToStrOffsets[encodedEnd]
+ return (resultStart, resultEnd)
diff --git a/tests/unit/test_textInfos.py b/tests/unit/test_textInfos.py
index 5ada05ff6ab..882e58782c3 100644
--- a/tests/unit/test_textInfos.py
+++ b/tests/unit/test_textInfos.py
@@ -176,6 +176,22 @@ def test_setEndpoint(self):
self.assertEqual((ti1._startOffset, ti1._endOffset), (5, 5))
+class TestWordExpansion(unittest.TestCase):
+ def test_expandWordDoesNotRequireFlowsToBeforeEndOfStory(self):
+ obj = BasicTextProvider(text="one two")
+ ti = obj.makeTextInfo(Offsets(0, 0))
+ ti.expand(textInfos.UNIT_WORD)
+ self.assertEqual(ti.text, "one ")
+
+ def test_expandWordAtEndOfStoryWithoutFlowsToDoesNothing(self):
+ obj = BasicTextProvider(text="one two")
+ ti = obj.makeTextInfo(textInfos.POSITION_ALL)
+ ti.collapse(end=True)
+ ti.expand(textInfos.UNIT_WORD)
+ self.assertEqual(ti.text, "")
+ self.assertEqual(ti.offsets, (7, 7))
+
+
class TestMoveToCodepointOffsetInBlackBoxTextInfo(unittest.TestCase):
THREE_CHARS = "012"
TEN_CHARS = "0123456789"
diff --git a/tests/unit/test_textUtils.py b/tests/unit/test_textUtils.py
index 6993ac7d962..048c8580e78 100644
--- a/tests/unit/test_textUtils.py
+++ b/tests/unit/test_textUtils.py
@@ -1,14 +1,15 @@
# A part of NonVisual Desktop Access (NVDA)
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.
-# Copyright (C) 2019-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter
+# Copyright (C) 2019-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong
"""Unit tests for the textUtils module."""
import unittest
-from textUtils import UnicodeNormalizationOffsetConverter, WideStringOffsetConverter
+from textUtils import UnicodeNormalizationOffsetConverter, WideStringOffsetConverter, WordSegmenter
from textUtils.uniscribe import splitAtCharacterBoundaries
+from textUtils.segFlag import WordSegFlag
FACE_PALM = "\U0001f926" # 🤦
SMILE = "\U0001f60a" # 😊
@@ -442,3 +443,28 @@ def test_sentenceWithComposites(self):
def test_hebrew(self):
self._testHelper("בְּרֵאשִׁית", ["בְּ", "רֵ", "א", "שִׁ", "י", "ת"])
+
+
+class TestWordSegmenter(unittest.TestCase):
+ """Tests for the WordSegmenter class."""
+
+ def test_basicLatin(self):
+ text = "hello world"
+ segmenter = WordSegmenter(text, wordSegFlag=WordSegFlag.UNISCRIBE)
+ self.assertEqual(segmenter.getSegmentForOffset(0), (0, 6))
+ self.assertEqual(segmenter.getSegmentForOffset(5), (0, 6))
+ self.assertEqual(segmenter.getSegmentForOffset(6), (6, 11))
+ self.assertEqual(segmenter.getSegmentForOffset(11), (6, 11))
+
+ def test_chinese(self):
+ text = "你好世界"
+
+ from textUtils.wordSeg.wordSegStrategy import ChineseWordSegmentationStrategy
+
+ ChineseWordSegmentationStrategy._initCppJieba(forceInit=True)
+ segmenter = WordSegmenter(text, wordSegFlag=WordSegFlag.CHINESE)
+ self.assertEqual(segmenter.getSegmentForOffset(0), (0, 2))
+ self.assertEqual(segmenter.getSegmentForOffset(1), (0, 2))
+ self.assertEqual(segmenter.getSegmentForOffset(2), (2, 4))
+ self.assertEqual(segmenter.getSegmentForOffset(3), (2, 4))
+ self.assertEqual(segmenter.getSegmentForOffset(4), (2, 4))
diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md
index 83f9e17a0f4..2bfd839bf77 100644
--- a/user_docs/en/changes.md
+++ b/user_docs/en/changes.md
@@ -28,6 +28,9 @@
Consult the speech dictionaries section in the User Guide for more details. (#19506, @LeonarddeR)
* When resetting the configuration to factory defaults from the NVDA menu, a dialog is now shown afterwards with an Undo button to restore the previous configuration.
The triple-press keyboard shortcut (`NVDA+ctrl+r`) is not affected, as it is intended for recovery scenarios. (#19575, @bramd)
+* Chinese text can now be navigated by word using built-in input gestures.
+ Several GUI elements were added to configure this in the `Document Navigation` panel. (#18735, @CrazySteve0605)
+* Braille output for Chinese now includes spaces between words. (#18865, @CrazySteve0605)
* Added an unassigned command to report the current status of the Screen Curtain. (#19759)
* DotPad braille displays now support multi-button combination gestures. (#19565, @bramd)
* You can now press multiple buttons simultaneously to create custom gestures (e.g., `f1+panLeft`).
@@ -91,6 +94,7 @@ It only ran the translation string comment check, which is equivalent to `scons
The `scons checkPot` target has also been replaced with `runcheckpot.bat`.
Use the individual test commands instead: `runcheckpot.bat`, `rununittests.bat`, `runsystemtests.bat`, `runlint.bat`. (#19606, #19676, @bramd)
* Updated Python 3.13.11 to 3.13.12 (#19572, @dpy013)
+* Added [cppjieba](https://github.com/yanyiwu/cppjieba) as a git submodule for Chinese word segmentation. (#18548, @CrazySteve0605)
* Added a private `_asyncioEventLoop` module that provides an asyncio event loop running on a background thread for use by NVDA components. (#19816, @bramd)
* Added several functions related to the braille auto-scroll feature. (#18573, @nvdaes):
* Added an `autoScroll` method to `braille.handler`.
@@ -106,6 +110,8 @@ Use the individual test commands instead: `runcheckpot.bat`, `rununittests.bat`,
* The `speechDictHandler.ENTRY_TYPE_*` constants are deprecated.
Use the `speechDictHandler.types.EntryType` enumeration instead. (#19430, @LeonarddeR)
* `speechDictHandler.SpeechDictEntry` and `speechDictHandler.SpeechDict` have been moved to `speechDictHandler.types`. (#19430, @LeonarddeR)
+* `useUniscribe` from `textInfos.offsets.OffsetsTextInfo` and its subclasses is deprecated.
+ Use `charSegFlag` and `wordSegFlag` instead. (#18735)