diff --git a/.gitattributes b/.gitattributes index 42076ebfce6..386b9e88c90 100644 --- a/.gitattributes +++ b/.gitattributes @@ -49,6 +49,7 @@ sconstruct text diff=python *.c text diff=c *.cpp text diff=cpp *.h text diff=c +*.hpp text diff=cpp *.idl text diff=c *.acf text diff=c diff --git a/.github/workflows/testAndPublish.yml b/.github/workflows/testAndPublish.yml index ac23f3154a6..a1253c9539e 100644 --- a/.github/workflows/testAndPublish.yml +++ b/.github/workflows/testAndPublish.yml @@ -78,7 +78,7 @@ jobs: - name: Checkout NVDA uses: actions/checkout@v6 with: - submodules: true + submodules: recursive - name: Install Python uses: actions/setup-python@v6 with: diff --git a/.gitignore b/.gitignore index 263e375fdfe..1981646f2be 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ source/lib source/lib64 source/typelibs source/louis +source/cppjieba *.obj *.exp *.lib diff --git a/.gitmodules b/.gitmodules index 0fa0d910a9e..9791fec7baf 100644 --- a/.gitmodules +++ b/.gitmodules @@ -42,3 +42,6 @@ [submodule "include/nvda-mathcat"] path = include/nvda-mathcat url = https://github.com/nvaccess/nvda-mathcat.git +[submodule "include/cppjieba"] + path = include/cppjieba + url = https://github.com/yanyiwu/cppjieba diff --git a/copying.txt b/copying.txt index 6b33fdaacec..014d8f7e9f9 100644 --- a/copying.txt +++ b/copying.txt @@ -1027,6 +1027,7 @@ In addition to these dependencies, the following are also included in NVDA: * Microsoft Detours: MIT * Python: PSF * NSIS: zlib/libpng +* cppjieba: MIT Furthermore, NVDA also utilises some static/binary dependencies, details of which can be found at the following URL: diff --git a/include/cppjieba b/include/cppjieba new file mode 160000 index 00000000000..9408c1d08fa --- /dev/null +++ b/include/cppjieba @@ -0,0 +1 @@ +Subproject commit 9408c1d08facc6e324dc90260e8cb20ecceebf70 diff --git a/include/readme.md b/include/readme.md index 12f09af1208..23bdfc8a227 100644 --- a/include/readme.md +++ b/include/readme.md @@ -61,3 
+61,10 @@
 Used in chrome system tests.
 
 Fetch latest from master.
+
+### cppjieba
+
+[cppjieba](https://github.com/yanyiwu/cppjieba).
+
+Fetch latest from master.
+Used for Chinese text segmentation.
diff --git a/nvdaHelper/archBuild_sconscript b/nvdaHelper/archBuild_sconscript
index 22f73cd893a..a39e3b437d6 100644
--- a/nvdaHelper/archBuild_sconscript
+++ b/nvdaHelper/archBuild_sconscript
@@ -226,6 +226,10 @@ Export("detoursLib")
 apiHookObj = env.Object("apiHook", "common/apiHook.cpp")
 Export("apiHookObj")
 
+cppjiebaLib = env.SConscript("cppjieba/sconscript")
+Export("cppjiebaLib")
+env.Install(libInstallDir, cppjiebaLib)
+
 localLib = env.SConscript("local/sconscript")
 Export("localLib")
 if signExec:
diff --git a/nvdaHelper/cppjieba/cppjieba.cpp b/nvdaHelper/cppjieba/cppjieba.cpp
new file mode 100644
index 00000000000..d63aa1f130c
--- /dev/null
+++ b/nvdaHelper/cppjieba/cppjieba.cpp
@@ -0,0 +1,149 @@
+/*
+A part of NonVisual Desktop Access (NVDA)
+Copyright (C) 2025 NV Access Limited, Wang Chong
+This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+*/
+
+#include "cppjieba.hpp"
+
+
+using namespace std;
+
+// static members for singleton bookkeeping
+JiebaSingleton* JiebaSingleton::instance = nullptr;
+std::once_flag JiebaSingleton::initFlag;
+
+// Returns the singleton, constructing it from dictDir on the first call only.
+// Exceptions thrown by the constructor propagate to the caller.
+JiebaSingleton& JiebaSingleton::getInstance(const char* dictDir) {
+	// std::string must not be built from a null pointer: treat nullptr as an empty directory
+	const std::string dir = dictDir ? dictDir : "";
+
+	// ensure singleton is constructed exactly once
+	std::call_once(JiebaSingleton::initFlag, [&]() {
+		// allocate on heap, so we avoid copy/move and control lifetime
+		JiebaSingleton::instance = new JiebaSingleton(dir.c_str());
+		// optional: register deleter at exit
+		std::atexit([]() {
+			delete JiebaSingleton::instance;
+			JiebaSingleton::instance = nullptr;
+		});
+	});
+
+	// after call_once, instance must be non-null
+	return *JiebaSingleton::instance;
+}
+
+// Returns the already-constructed singleton; throws std::runtime_error if none exists yet.
+JiebaSingleton& JiebaSingleton::getInstance() {
+	if (!JiebaSingleton::instance) {
+		throw std::runtime_error("JiebaSingleton::getInstance() called before initialization. Call getInstance(dictDir) or initJieba() first.");
+	}
+	return *JiebaSingleton::instance;
+}
+
+JiebaSingleton::JiebaSingleton(const char* dictDir)
+: cppjieba::JiebaSegmenter(
+	std::string(dictDir),
+	std::string(dictDir),
+	std::string(dictDir)
+  )
+{
+	// base class ctor will load dictionaries/models
+}
+
+void JiebaSingleton::getWordEndOffsets(const std::string& text, std::vector<int>& wordEndOffsets) {
+	std::lock_guard<std::mutex> lock(segMutex);
+	wordEndOffsets.clear();
+	std::vector<cppjieba::Word> words;
+	this->Cut(text, words, true);
+
+	// end offset of each word, taken from cppjieba's unicode_offset/unicode_length (presumably characters, not bytes)
+	for (const auto& word : words) {
+		wordEndOffsets.push_back(word.unicode_offset + word.unicode_length);
+	}
+}
+
+extern "C" {
+
+bool initJieba(const char* dictDir) {
+	try {
+		// simply force the singleton into existence
+		(void)JiebaSingleton::getInstance(dictDir);
+		return true;
+	} catch (...) {
+		return false;
+	}
+}
+
+bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen) {
+	if (!text || !wordEndOffsets || !outLen) return false;
+
+	try {
+		std::string textStr(text);
+		std::vector<int> offs;
+		JiebaSingleton::getInstance().getWordEndOffsets(textStr, offs);
+
+		int n = static_cast<int>(offs.size());
+		if (n == 0) {
+			*wordEndOffsets = nullptr;
+			*outLen = 0;
+			return true; // success, but no offsets
+		}
+
+		// the caller releases this buffer with freeOffsets()
+		int* buf = static_cast<int*>(std::malloc(sizeof(int) * n));
+		if (!buf) {
+			*wordEndOffsets = nullptr;
+			*outLen = 0;
+			return false;
+		}
+		for (int i = 0; i < n; ++i) buf[i] = offs[i];
+		*wordEndOffsets = buf;
+		*outLen = n;
+		return true;
+	} catch (...) {
+		*wordEndOffsets = nullptr;
+		*outLen = 0;
+		return false;
+	}
+}
+
+// The word-management wrappers below reject a null word, fall back to UNKNOWN_TAG for a null tag,
+// and return false instead of letting a C++ exception (e.g. the "not initialized" runtime_error
+// thrown by getInstance()) escape through the C ABI.
+// NOTE(review): they call the base class directly and do not take segMutex, so they are not
+// serialized against getWordEndOffsets() - confirm callers never run them concurrently.
+bool insertUserWord(const char* word, int freq, const char* tag = cppjieba::UNKNOWN_TAG) {
+	if (!word) return false;
+	try {
+		return JiebaSingleton::getInstance().InsertUserWord(string(word), freq, string(tag ? tag : cppjieba::UNKNOWN_TAG));
+	} catch (...) {
+		return false;
+	}
+}
+
+bool deleteUserWord(const char* word, const char* tag = cppjieba::UNKNOWN_TAG) {
+	if (!word) return false;
+	try {
+		return JiebaSingleton::getInstance().DeleteUserWord(string(word), string(tag ? tag : cppjieba::UNKNOWN_TAG));
+	} catch (...) {
+		return false;
+	}
+}
+
+bool find(const char* word) {
+	if (!word) return false;
+	try {
+		return JiebaSingleton::getInstance().Find(string(word));
+	} catch (...) {
+		return false;
+	}
+}
+
+void freeOffsets(int* ptr) {
+	if (ptr) std::free(ptr);
+}
+
+} // extern "C"
diff --git a/nvdaHelper/cppjieba/cppjieba.def b/nvdaHelper/cppjieba/cppjieba.def
new file mode 100644
index 00000000000..fca4a152027
--- /dev/null
+++ b/nvdaHelper/cppjieba/cppjieba.def
@@ -0,0 +1,8 @@
+LIBRARY cppjieba
+EXPORTS
+	initJieba
+	calculateWordOffsets
+	insertUserWord
+	deleteUserWord
+	find
+	freeOffsets
diff --git a/nvdaHelper/cppjieba/cppjieba.hpp b/nvdaHelper/cppjieba/cppjieba.hpp
new file mode 100644
index 00000000000..13ccf47acc6
--- /dev/null
+++ b/nvdaHelper/cppjieba/cppjieba.hpp
@@ -0,0 +1,165 @@
+/*
+A part of NonVisual Desktop Access (NVDA)
+Copyright (C) 2025 NV Access Limited, Wang Chong
+This file may be used under the terms of the GNU General Public License, version 2 or
later, as modified by the NVDA license. +For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt +*/ + +#ifndef CPPJIEBA_DLL_H +#define CPPJIEBA_DLL_H +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "QuerySegment.hpp" + +using namespace std; + +namespace cppjieba { // copied from Jieba.hpp and modified to drop off its keyword extractor we don't use + +class JiebaSegmenter { + public: + JiebaSegmenter(const string& dict_path, + const string& model_path, + const string& user_dict_path) + : dict_trie_(pathJoin(dict_path, "jieba.dict.utf8"), pathJoin(user_dict_path, "user.dict.utf8")), + model_(pathJoin(model_path, "hmm_model.utf8")), + mix_seg_(&dict_trie_, &model_) { + } + ~JiebaSegmenter() { + } + + void Cut(const string& sentence, vector& words, bool hmm = true) const { + mix_seg_.Cut(sentence, words, hmm); + } + + bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) { + return dict_trie_.InsertUserWord(word,freq, tag); + } + + bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) { + return dict_trie_.DeleteUserWord(word, tag); + } + + bool Find(const string& word) + { + return dict_trie_.Find(word); + } + + void ResetSeparators(const string& s) { + mix_seg_.ResetSeparators(s); + } + + const DictTrie* GetDictTrie() const { + return &dict_trie_; + } + + const HMMModel* GetHMMModel() const { + return &model_; + } + + void LoadUserDict(const vector& buf) { + dict_trie_.LoadUserDict(buf); + } + + void LoadUserDict(const set& buf) { + dict_trie_.LoadUserDict(buf); + } + + void LoadUserDict(const string& path) { + dict_trie_.LoadUserDict(path); + } + + private: + static string pathJoin(const string& dir, const string& filename) { + if (dir.empty()) { + return filename; + } + + char last_char = dir[dir.length() - 1]; + if (last_char == '/' || last_char == '\\') { + return dir + filename; + } else { + 
#ifdef _WIN32
+			return dir + '\\' + filename;
+			#else
+			return dir + '/' + filename;
+			#endif
+		}
+	}
+
+	static string getCurrentDirectory() {
+		string path(__FILE__);
+		size_t pos = path.find_last_of("/\\");
+		return (pos == string::npos) ? "" : path.substr(0, pos);
+	}
+
+	DictTrie dict_trie_;
+	HMMModel model_;
+
+	MixSegment mix_seg_;
+}; // class JiebaSegmenter
+
+} // namespace cppjieba
+
+
+/// @brief Singleton wrapper around cppjieba::JiebaSegmenter that adds a mutex-guarded segmentation call.
+class JiebaSingleton : public cppjieba::JiebaSegmenter {
+public:
+	/// @brief Returns the single instance, constructing it from dictDir on the first call.
+	static JiebaSingleton& getInstance(const char* dictDir);
+
+	static JiebaSingleton& getInstance(); ///< returns the existing instance; throws std::runtime_error if not yet constructed
+
+	/// @brief Segment text while holding segMutex and compute word end offsets.
+	/// @param text The input text in UTF-8 encoding.
+	/// @param wordEndOffsets Output vector, cleared first, receiving each word's end offset (unicode_offset + unicode_length, not a byte offset).
+	void getWordEndOffsets(const std::string& text, std::vector<int>& wordEndOffsets);
+
+	// singleton bookkeeping
+	static JiebaSingleton* instance;
+	static std::once_flag initFlag;
+
+private:
+	JiebaSingleton(const char* dictDir); ///< private ctor initializes the JiebaSegmenter base
+
+	/// Disable copy and move
+	JiebaSingleton(const JiebaSingleton&) = delete;
+	JiebaSingleton& operator = (const JiebaSingleton&) = delete;
+	JiebaSingleton(JiebaSingleton&&) = delete;
+	JiebaSingleton& operator = (JiebaSingleton&&) = delete;
+
+	std::mutex segMutex; ///< guards concurrent Cut() calls made through getWordEndOffsets()
+};
+
+#ifdef _WIN32
+# define JIEBA_API __declspec(dllexport)
+#else
+# define JIEBA_API
+#endif
+
+extern "C" {
+
+/// @brief Force singleton construction (load dicts, etc.) before any segmentation.
+JIEBA_API bool initJieba(const char* dictDir);
+
+/// @brief Segment UTF-8 text into character offsets.
+/// @return true on success, false on failure.
+JIEBA_API bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen); + +/// Wrapper for word management +JIEBA_API bool insertUserWord(const char* word, int freq, const char* tag); +JIEBA_API bool deleteUserWord(const char* word, const char* tag); +JIEBA_API bool find(const char* word); + +/// @brief Free memory allocated by calculateWordOffsets. +JIEBA_API void freeOffsets(int* ptr); + +} // extern "C" + +#endif // CPPJIEBA_DLL_H diff --git a/nvdaHelper/cppjieba/sconscript b/nvdaHelper/cppjieba/sconscript new file mode 100644 index 00000000000..714c99330a9 --- /dev/null +++ b/nvdaHelper/cppjieba/sconscript @@ -0,0 +1,58 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +import typing # noqa: E402 +import os + +Import( + [ + "thirdPartyEnv", + "sourceDir", + ] +) +thirdPartyEnv: Environment = thirdPartyEnv +env: Environment = typing.cast(Environment, thirdPartyEnv.Clone()) + +cppjiebaPath = Dir("#include/cppjieba") +cppjiebaSrcPath = cppjiebaPath.Dir("include/cppjieba") +cppjiebaDictPath = cppjiebaPath.Dir("dict") +outDir = sourceDir.Dir("cppjieba") +unitTestDictsDir = env.Dir("#tests/unit/cppjiebaDicts") +LimonpPath = cppjiebaPath.Dir("deps/limonp") # cppjieba's dependency +LimonpSrcPath = LimonpPath.Dir("include/limonp") + +env.Prepend( + CPPPATH=[ + cppjiebaSrcPath, + LimonpSrcPath.Dir(".."), + ] +) + +sourceFiles = [ + "cppjieba.cpp", + "cppjieba.def", +] + +env.AppendUnique( + CCFLAGS=['/wd4819'], + CXXFLAGS=['/wd4819'], +) + +cppjiebaLib = env.SharedLibrary(target="cppjieba", source=sourceFiles) + +if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # ensure dicts 
installation happens only once and avoid a scons' warning + env.Install( + outDir.Dir("dicts"), + [ + env.Dir(cppjiebaDictPath).File(name) + for name in ( + "jieba.dict.utf8", + "user.dict.utf8", + "hmm_model.utf8", + ) + ] + ) + +Return("cppjiebaLib") diff --git a/projectDocs/dev/createDevEnvironment.md b/projectDocs/dev/createDevEnvironment.md index 6c7349c7415..30e5001b19d 100644 --- a/projectDocs/dev/createDevEnvironment.md +++ b/projectDocs/dev/createDevEnvironment.md @@ -97,6 +97,7 @@ If you aren't sure, run `git submodule update` after every git pull, merge or ch * [Nullsoft Install System](https://nsis.sourceforge.io), version 3.11 * [Java Access Bridge 64 bit, from Zulu Community OpenJDK build 17.0.16+8 Zulu (17.60.17)](https://github.com/nvaccess/javaAccessBridge32-bin) * [Windows Implementation Library (WIL)](https://github.com/microsoft/wil/), commit `7cf41936c5b4ab79daf0d9437211380dc69fa958` +* [cppjieba - Chinese word segmentation](https://github.com/yanyiwu/cppjieba), commit `9408c1d08facc6e324dc90260e8cb20ecceebf70` #### Build time dependencies diff --git a/source/NVDAObjects/window/edit.py b/source/NVDAObjects/window/edit.py index 9f77c762a0b..2e0d7cb00f8 100644 --- a/source/NVDAObjects/window/edit.py +++ b/source/NVDAObjects/window/edit.py @@ -1,5 +1,5 @@ # A part of NonVisual Desktop Access (NVDA) -# Copyright (C) 2006-2026 NV Access Limited, Babbage B.V., Cyrille Bougot, Leonard de Ruijter +# Copyright (C) 2006-2026 NV Access Limited, Babbage B.V., Cyrille Bougot, Leonard de Ruijter, Wang Chong # This file is covered by the GNU General Public License. # See the file COPYING for more details. 
@@ -26,6 +26,7 @@ import watchdog import locationHelper import textUtils +from textUtils.segFlag import CharSegFlag, WordSegFlag import NVDAHelper.localLib @@ -163,6 +164,13 @@ class getTextLengthExStruct(ctypes.Structure): class EditTextInfo(textInfos.offsets.OffsetsTextInfo): + # Override segFlags to enforce use of Uniscribe + charSegFlag = CharSegFlag.UNISCRIBE + + @property + def wordSegFlag(self): + return WordSegFlag.UNISCRIBE + def _getPointFromOffset(self, offset): if self.obj.editAPIVersion == 1 or self.obj.editAPIVersion >= 3: processHandle = self.obj.processHandle diff --git a/source/braille.py b/source/braille.py index 6e1d36be208..9752bdc2293 100644 --- a/source/braille.py +++ b/source/braille.py @@ -71,7 +71,7 @@ from autoSettingsUtils.driverSetting import BooleanDriverSetting, NumericDriverSetting from utils.security import objectBelowLockScreenAndWindowsIsLocked, post_sessionLockStateChanged from winAPI.secureDesktop import post_secureDesktopStateChange -from textUtils import isUnicodeNormalized, UnicodeNormalizationOffsetConverter +from textUtils import isUnicodeNormalized, OffsetConverter, UnicodeNormalizationOffsetConverter import hwIo from editableText import EditableText from gui.guiHelper import wxCallOnMain @@ -600,10 +600,21 @@ def update(self): if config.conf["braille"]["expandAtCursor"] and self.cursorPos is not None: mode |= louis.compbrlAtCursor - converter: UnicodeNormalizationOffsetConverter | None = None + converter: OffsetConverter | None = None textToTranslate = self.rawText textToTranslateTypeforms = self.rawTextTypeforms cursorPos = self.cursorPos + if ( + config.conf["braille"]["translationTable"].startswith("zh") + or config.conf["braille"]["translationTable"] == "auto" + and brailleTables.getDefaultTableForCurLang(brailleTables.TableType.OUTPUT).startswith("zh") + ): + from textUtils.wordSeg.wordSegUtils import WordSegWithSeparatorOffsetConverter # noqa: F401 + + converter = WordSegWithSeparatorOffsetConverter(textToTranslate) 
+ textToTranslate = converter.encoded + if cursorPos is not None: + cursorPos = converter.strToEncodedOffsets(cursorPos) if config.conf["braille"]["unicodeNormalization"] and not isUnicodeNormalized(textToTranslate): converter = UnicodeNormalizationOffsetConverter(textToTranslate) textToTranslate = converter.encoded diff --git a/source/browseMode.py b/source/browseMode.py index a8cb791068b..6be86d59e71 100644 --- a/source/browseMode.py +++ b/source/browseMode.py @@ -1,6 +1,7 @@ # A part of NonVisual Desktop Access (NVDA) # Copyright (C) 2007-2026 NV Access Limited, Babbage B.V., James Teh, Leonard de Ruijter, -# Thomas Stivers, Accessolutions, Julien Cochuyt, Cyrille Bougot, Kefas Lungu +# Thomas Stivers, Accessolutions, Julien Cochuyt, Cyrille Bougot, Kefas Lungu, +# Wang Chong # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. # For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt diff --git a/source/config/configSpec.py b/source/config/configSpec.py index 982532d4d5f..ab1d872b61a 100644 --- a/source/config/configSpec.py +++ b/source/config/configSpec.py @@ -1,7 +1,8 @@ # A part of NonVisual Desktop Access (NVDA) # Copyright (C) 2006-2026 NV Access Limited, Babbage B.V., Davy Kager, Bill Dengler, Julien Cochuyt, # Joseph Lee, Dawid Pieper, mltony, Bram Duvigneau, Cyrille Bougot, Rob Meredith, -# Burman's Computer and Education Ltd., Leonard de Ruijter, Łukasz Golonka, Cary-rowen, Kefas Lungu +# Burman's Computer and Education Ltd., Leonard de Ruijter, Łukasz Golonka, Cary-rowen, +# Wang Chong, Kefas Lungu # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt @@ -279,6 +280,8 @@ reportClickable = boolean(default=true) [documentNavigation] + initWordSegForUnusedLang = boolean(default=false) + wordSegmentationStandard = featureFlag(optionsEnum="WordNavigationUnitFlag", behaviorOfDefault="Auto") paragraphStyle = featureFlag(optionsEnum="ParagraphNavigationFlag", behaviorOfDefault="application") [reviewCursor] diff --git a/source/config/featureFlagEnums.py b/source/config/featureFlagEnums.py index 5bcb1db1fdb..59c78bef409 100644 --- a/source/config/featureFlagEnums.py +++ b/source/config/featureFlagEnums.py @@ -1,7 +1,7 @@ # A part of NonVisual Desktop Access (NVDA) -# Copyright (C) 2022 NV Access Limited, Bill Dengler, Rob Meredith -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. +# Copyright (C) 2022-2025 NV Access Limited, Bill Dengler, Rob Meredith, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt """ Feature flag value enumerations. @@ -139,6 +139,26 @@ def _displayStringLabels(self) -> dict["FontFormattingBrailleModeFlag", str]: } +class WordNavigationUnitFlag(DisplayStringEnum): + """Enumeration for word navigation.""" + + @property + def _displayStringLabels(self): + return { + # Translators: Label for a method of word segmentation. + self.AUTO: _("Auto"), + # Translators: Label for a method of word segmentation. + self.UNISCRIBE: _("Standard"), + # Translators: Label for a method of word segmentation. 
+ self.CHINESE: _("Chinese"), + } + + DEFAULT = enum.auto() + AUTO = enum.auto() + UNISCRIBE = enum.auto() + CHINESE = enum.auto() + + def getAvailableEnums() -> typing.Generator[typing.Tuple[str, FlagValueEnum], None, None]: for name, value in globals().items(): if ( diff --git a/source/core.py b/source/core.py index 0deada7f022..dfffb773b05 100644 --- a/source/core.py +++ b/source/core.py @@ -1,6 +1,6 @@ # A part of NonVisual Desktop Access (NVDA) # Copyright (C) 2006-2025 NV Access Limited, Aleksey Sadovoy, Christopher Toth, Joseph Lee, Peter Vágner, -# Derek Riemer, Babbage B.V., Zahari Yurukov, Łukasz Golonka, Cyrille Bougot, Julien Cochuyt +# Derek Riemer, Babbage B.V., Zahari Yurukov, Łukasz Golonka, Cyrille Bougot, Julien Cochuyt, Wang Chong # This file is covered by the GNU General Public License. # See the file COPYING for more details. @@ -926,6 +926,17 @@ def main(): _remoteClient.initialize() + from textUtils import wordSeg + + log.debug("Initializing word segmentation module") + + try: + wordSeg.initialize() + except RuntimeError: + log.warning("Word segmentation module disabled in configuration") + except Exception: + log.error("Error initializing word segmentation module", exc_info=True) + if globalVars.appArgs.install or globalVars.appArgs.installSilent: import gui.installerGui diff --git a/source/displayModel.py b/source/displayModel.py index fde2cb0110e..941f11a6234 100644 --- a/source/displayModel.py +++ b/source/displayModel.py @@ -1,7 +1,7 @@ # A part of NonVisual Desktop Access (NVDA) -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. -# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Joseph Lee, Cyrille Bougot +# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Joseph Lee, Cyrille Bougot, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt from ctypes import byref, c_short, c_long import unicodedata @@ -22,6 +22,7 @@ import windowUtils from locationHelper import RectLTRB, RectLTWH import textUtils +from textUtils.segFlag import CharSegFlag, WordSegFlag from typing import ( List, Tuple, @@ -525,7 +526,12 @@ def _getStoryLength(self): return lineEndOffsets[-1] return 0 - useUniscribe = False + # Override segFlags to strictly use the old fallen-back method + charSegFlag = CharSegFlag.NONE + + @property + def wordSegFlag(self): + return WordSegFlag.NONE def _getTextRange(self, start, end): return "".join(x for x in self._getFieldsInRange(start, end) if isinstance(x, str)) diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py index c549a6ad38c..39df035ba4c 100644 --- a/source/gui/settingsDialogs.py +++ b/source/gui/settingsDialogs.py @@ -6,7 +6,7 @@ # Łukasz Golonka, Aaron Cannon, Adriani90, André-Abush Clause, Dawid Pieper, # Takuya Nishimoto, jakubl7545, Tony Malykh, Rob Meredith, # Burman's Computer and Education Ltd, hwf1324, Cary-rowen, Christopher Proß, Tianze -# Neil Soiffer, Ryan McCleary, Kefas Lungu. +# Neil Soiffer, Ryan McCleary, Wang Chong, Kefas Lungu. # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt @@ -47,7 +47,6 @@ import speech import speechDictHandler import systemUtils -from utils.security import isRunningOnSecureDesktop import vision import vision.providerBase import vision.providerInfo @@ -57,6 +56,8 @@ import screenCurtain._screenCurtain from utils import mmdevice +from utils.debounce import debounceLimiter +from utils.security import isRunningOnSecureDesktop from vision.providerBase import VisionEnhancementProviderSettings from wx.lib.expando import ExpandoTextCtrl import wx.lib.newevent @@ -750,13 +751,16 @@ def _doCategoryChange(self, newCatId): self.container.Thaw() def onCategoryChange(self, evt: wx.ListEvent): - currentCat = self.currentCategory newIndex = evt.GetIndex() - if not currentCat or newIndex != self.categoryClasses.index(currentCat.__class__): + if self._shouldDoCategoryChange(newIndex): self._doCategoryChange(newIndex) else: evt.Skip() + def _shouldDoCategoryChange(self, index: int) -> bool: + currentCat = self.currentCategory + return not currentCat or index != self.categoryClasses.index(currentCat.__class__) + def _validateAllPanels(self): """Check if all panels are valid, and can be saved @note: raises ValueError if a panel is not valid. 
See c{SettingsPanel.isValid} @@ -3543,6 +3547,17 @@ class DocumentNavigationPanel(SettingsPanel): def makeSettings(self, settingsSizer: wx.BoxSizer) -> None: sHelper = guiHelper.BoxSizerHelper(self, sizer=settingsSizer) + + # Translators: This is a label for the word segmentation standard in the document navigation dialog + WordNavigationUnitLabel = _("&Word Segmentation Standard:") + self.wordSegCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl( + labelText=WordNavigationUnitLabel, + wxCtrlClass=nvdaControls.FeatureFlagCombo, + keyPath=["documentNavigation", "wordSegmentationStandard"], + conf=config.conf, + ) + self.bindHelpEvent("wordSegmentationStandard", self.wordSegCombo) + # Translators: This is a label for the paragraph navigation style in the document navigation dialog paragraphStyleLabel = _("&Paragraph style:") self.paragraphStyleCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl( @@ -3554,8 +3569,21 @@ def makeSettings(self, settingsSizer: wx.BoxSizer) -> None: self.bindHelpEvent("ParagraphStyle", self.paragraphStyleCombo) def onSave(self): + self.wordSegCombo.saveCurrentValueToConf() self.paragraphStyleCombo.saveCurrentValueToConf() + def postSave(self): + from textUtils import wordSeg + + log.debug("Reinitializing word segmentation module") + + try: + wordSeg.initialize() + except RuntimeError: + log.warning("Word segmentation module disabled in configuration") + except Exception: + log.error("Error reinitializing word segmentation module", exc_info=True) + def _synthWarningDialog(newSynth: str): gui.messageBox( @@ -6433,6 +6461,7 @@ def _confirmEnableScreenCurtainWithUser(self) -> bool: class NVDASettingsDialog(MultiCategorySettingsDialog): # Translators: This is the label for the NVDA settings dialog. 
title = _("NVDA Settings") + _pendingCategoryIndex: int | None = None categoryClasses = [ GeneralSettingsPanel, SpeechSettingsPanel, @@ -6502,13 +6531,36 @@ def _getDialogTitle(self): configProfile=NvdaSettingsDialogActiveConfigProfile, ) + def _doCategoryChangeForIndex(self, newIndex: int) -> bool: + if self._shouldDoCategoryChange(newIndex): + self._doCategoryChange(newIndex) + return True + return False + + @debounceLimiter( + cooldownTimeMs=500, + delayTimeMs=500, + ) + def _onCategoryChangeDebounced(self) -> None: + if self._pendingCategoryIndex is not None: + if self._doCategoryChangeForIndex(self._pendingCategoryIndex): + self._doOnCategoryChange() + self._pendingCategoryIndex = None + def onCategoryChange(self, evt: wx.ListEvent): + if isRunningOnSecureDesktop(): + # Secure desktop can cause issues with rapidly changing categories, + # so we debounce category changes to avoid this. (#19634) + self._pendingCategoryIndex = evt.GetIndex() + self._onCategoryChangeDebounced() + return super().onCategoryChange(evt) if evt.Skipped: return self._doOnCategoryChange() def Destroy(self): + self._pendingCategoryIndex = None global NvdaSettingsDialogActiveConfigProfile, NvdaSettingsDialogWindowHandle NvdaSettingsDialogActiveConfigProfile = None NvdaSettingsDialogWindowHandle = None diff --git a/source/setup.py b/source/setup.py index 10e4ab18ff1..2eb4fd48849 100755 --- a/source/setup.py +++ b/source/setup.py @@ -351,6 +351,7 @@ def _genManifestTemplate(shouldHaveUIAccess: bool) -> tuple[int, int, bytes]: ("images", glob("images/*.ico")), ("fonts", glob("fonts/*.ttf")), ("louis/tables", glob("louis/tables/*")), + ("cppjieba/dicts", glob("cppjieba/dicts/*")), ("COMRegistrationFixes", glob("COMRegistrationFixes/*.reg")), ("miscDeps/tools", ["../miscDeps/tools/msgfmt.exe"]), (".", glob("../miscDeps/python/*.dll")), diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index 1def339c7b4..11d8c297314 100755 --- a/source/textInfos/offsets.py +++ 
b/source/textInfos/offsets.py
@@ -1,5 +1,5 @@
 # A part of NonVisual Desktop Access (NVDA)
-# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter
+# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong
 # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
 # For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
 
@@ -8,12 +8,14 @@
 import ctypes
 import unicodedata
 import NVDAHelper
+import config.featureFlagEnums
 import NVDAState
 import config
 import textInfos
 import locationHelper
 from treeInterceptorHandler import TreeInterceptor
 import textUtils
+from textUtils.segFlag import CharSegFlag, WordSegFlag
 from dataclasses import dataclass
 from typing import (
 	Optional,
@@ -156,8 +158,23 @@ class OffsetsTextInfo(textInfos.TextInfo):
 	#: Honours documentFormatting config option if true - set to false if this is not at all slow.
 	detectFormattingAfterCursorMaybeSlow: bool = True
 
-	#: Use uniscribe to calculate word offsets etc.
-	useUniscribe: bool = True
+	#: Method to calculate character and word offsets.
+	charSegFlag: CharSegFlag = CharSegFlag.UNISCRIBE
+
+	@property
+	def wordSegFlag(self) -> WordSegFlag | None:
+		"""The word segmentation method chosen by the ``wordSegmentationStandard`` setting, or None if it is unrecognised."""
+		match self.wordSegConf.calculated():
+			case config.featureFlagEnums.WordNavigationUnitFlag.UNISCRIBE:
+				return WordSegFlag.UNISCRIBE
+			case config.featureFlagEnums.WordNavigationUnitFlag.AUTO:
+				return WordSegFlag.AUTO
+			case config.featureFlagEnums.WordNavigationUnitFlag.CHINESE:
+				return WordSegFlag.CHINESE
+			case _:
+				log.error(f"Unknown word segmentation standard, {self.wordSegConf.calculated()!r}")
+				return None
+
 
 	#: The encoding internal to the underlying text info implementation.
encoding: Optional[str] = textUtils.WCHAR_ENCODING @@ -377,7 +392,7 @@ def _getCharacterOffsets(self, offset): lineStart, lineEnd = self._getLineOffsets(offset) lineText = self._getTextRange(lineStart, lineEnd) relOffset = offset - lineStart - if self.useUniscribe: + if self.charSegFlag == CharSegFlag.UNISCRIBE: offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_CHARACTER, relOffset) if offsets is not None: return (offsets[0] + lineStart, offsets[1] + lineStart) @@ -401,8 +416,10 @@ def _getWordOffsets(self, offset): # Convert NULL and non-breaking space to space to make sure that words will break on them lineText = lineText.translate({0: " ", 0xA0: " "}) relOffset = offset - lineStart - if self.useUniscribe: - offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_WORD, relOffset) + if self.wordSegFlag: + offsets = textUtils.WordSegmenter(lineText, self.encoding, self.wordSegFlag).getSegmentForOffset( + relOffset, + ) if offsets is not None: return (offsets[0] + lineStart, offsets[1] + lineStart) # Fall back to the older word offsets detection that only breaks on non alphanumeric @@ -476,6 +493,10 @@ def __init__(self, obj, position): Subclasses may extend this to perform implementation specific initialisation, calling their superclass method afterwards. 
""" super(OffsetsTextInfo, self).__init__(obj, position) + self.wordSegConf: config.featureFlag.FeatureFlag = config.conf["documentNavigation"][ + "wordSegmentationStandard" + ] + from NVDAObjects import NVDAObject if isinstance(position, locationHelper.Point): @@ -562,6 +583,13 @@ def collapse(self, end=False): self._startOffset = self._endOffset def expand(self, unit): + if unit == textInfos.UNIT_WORD and self.isCollapsed and self._startOffset == self._getStoryLength(): + try: + flowsTo = self.obj.flowsTo + except (AttributeError, NotImplementedError): + flowsTo = None + if not flowsTo: + return self._startOffset, self._endOffset = self._getUnitOffsets(unit, self._startOffset) def copy(self): diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 9086060fb19..edc6757dc9c 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -1,16 +1,18 @@ # A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2018-2026 NV Access Limited, Babbage B.V., Łukasz Golonka, Wang Chong # This file is covered by the GNU General Public License. # See the file COPYING for more details. -# Copyright (C) 2018-2026 NV Access Limited, Babbage B.V., Łukasz Golonka """ Classes and utilities to deal with offsets variable width encodings, particularly utf_16. 
""" import ctypes +import re import encodings import locale import unicodedata + from abc import ABCMeta, abstractmethod, abstractproperty from functools import cached_property from typing import Generator, Optional, Tuple, Type @@ -18,6 +20,8 @@ from logHandler import log from .uniscribe import splitAtCharacterBoundaries +from .wordSeg import wordSegStrategy +from .segFlag import WordSegFlag WCHAR_ENCODING = "utf_16_le" UTF8_ENCODING = "utf-8" @@ -581,3 +585,62 @@ def getOffsetConverter(encoding: str) -> Type[OffsetConverter]: return ENCODINGS_TO_CONVERTERS[encoding] except IndexError as e: raise LookupError(f"Don't know how to deal with encoding '{encoding}'", e) + + +class WordSegmenter: + """Selects appropriate segmentation strategy and segments text.""" + + # Precompiled patterns + # Chinese characters and Japanese kanji (CJK Unified Ideographs U+4E00 - U+9FFF) + _CHINESE_CHARACTER_AND_JAPANESE_KANJI: re.Pattern = re.compile(r"[\u4E00-\u9FFF]") + # Japanese kana (Hiragana U+3040 - U+309F, Katakana U+30A0 - U+30FF) + _KANA: re.Pattern = re.compile(r"[\u3040-\u309F\u30A0-\u30FF]") + + def __init__(self, text: str, encoding: str = "UTF-8", wordSegFlag: WordSegFlag = WordSegFlag.AUTO): + self.text: str = text + self.encoding: str | None = encoding + self.wordSegFlag: WordSegFlag = wordSegFlag + self.strategy: wordSegStrategy.WordSegmentationStrategy = self._chooseStrategy() + + def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: optimize + """Choose the appropriate segmentation strategy based on the text content.""" + if self.wordSegFlag == WordSegFlag.AUTO: + if ( + wordSegStrategy.ChineseWordSegmentationStrategy._lib + and WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search( + self.text, + ) + and not WordSegmenter._KANA.search(self.text) + ): + return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) + else: + return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) + else: + 
match self.wordSegFlag: + case WordSegFlag.UNISCRIBE: + return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) + case WordSegFlag.CHINESE: + if wordSegStrategy.ChineseWordSegmentationStrategy._lib: + return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) + else: + log.debugWarning("Chinese word segmenter is loading. Falling back to Uniscribe.") + return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) + case _: + return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) + + def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: + """Get the segment containing the given offset.""" + try: + return self.strategy.getSegmentForOffset(offset) + except Exception as e: + log.debugWarning( + "WordSegmenter.getSegmentForOffset failed: %s text: '%s' offset: %s segmentation strategy: %s", + e, + self.text, + offset, + self.strategy, + ) + return None + + def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: + return self.strategy.segmentedText(sep, newSepIndex) diff --git a/source/textUtils/segFlag.py b/source/textUtils/segFlag.py new file mode 100644 index 00000000000..72153c80e18 --- /dev/null +++ b/source/textUtils/segFlag.py @@ -0,0 +1,28 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +from enum import IntFlag + +# shared bit masks (explicit powers of two) +_AUTO: int = 1 << 0 +_UNISCRIBE: int = 1 << 1 +_CHINESE: int = 1 << 2 + + +class CharSegFlag(IntFlag): + """Character-level segmentation flags.""" + + NONE: int = 0 + AUTO: int = _AUTO + UNISCRIBE: int = _UNISCRIBE + + +class WordSegFlag(IntFlag): + """Word-level segmentation flags.""" + + NONE: int = 0 + AUTO: int = _AUTO + UNISCRIBE: int = _UNISCRIBE + CHINESE: int = _CHINESE diff --git a/source/textUtils/wordSeg/__init__.py b/source/textUtils/wordSeg/__init__.py new file mode 100644 index 00000000000..77231b58fa3 --- /dev/null +++ b/source/textUtils/wordSeg/__init__.py @@ -0,0 +1,46 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +import importlib +from logHandler import log + + +def initialize(): + """ + Call all registered initializer functions recorded in wordSegStrategy.initializerList. + + Each entry is a tuple: (module_name, qualname, func_obj, args, kwargs). + We try to resolve the callable from the module and qualname at runtime + (this handles classmethod/staticmethod wrapping order). If resolution fails, + we fall back to the stored func_obj. + + Exceptions from individual initializers are caught and logged so that one + failing initializer doesn't stop the rest. + """ + + from . 
import wordSegStrategy + from threading import Thread + + for module_name, qualname, func_obj, args, kwargs in wordSegStrategy.initializerList: + callable_to_call = None + # try to resolve module + qualname to a current attribute (handles classmethod/staticmethod) + try: + mod = importlib.import_module(module_name) + obj = mod + for part in qualname.split("."): + obj = getattr(obj, part) + callable_to_call = obj + except Exception: + # fallback to original function object captured during decoration + callable_to_call = func_obj + + # Final call with its args/kwargs and exception handling + try: + if not callable(callable_to_call): + raise TypeError(f"Resolved initializer is not callable: {module_name}.{qualname}") + Thread(target=callable_to_call, args=args, kwargs=kwargs, daemon=True).start() + except Exception as e: + log.debug("Initializer %s.%s failed: %s", module_name, qualname, e) + return diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py new file mode 100644 index 00000000000..7c9351170ac --- /dev/null +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -0,0 +1,331 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +import os +import ctypes +from ctypes import ( + c_bool, + c_char_p, + c_int, + create_string_buffer, + POINTER, + byref, +) +from abc import ABC, abstractmethod +from functools import lru_cache +from collections.abc import Callable +from typing import Any +import re +import unicodedata + +import textUtils +from logHandler import log + + +# Initializer registry (robust: saves module + qualname + original function + args/kwargs) +# Each entry: (module_name: str, qualname: str, func_obj: Callable, args: tuple, kwargs: dict) +initializerList: list[tuple[str, str, Callable[..., Any], tuple[Any, ...], dict[str, Any]]] = [] + + +def initializerRegistry(*decorator_args, **decorator_kwargs): + """ + A decorator to register an initializer function. + Usage: + @initializerRegistry + def f(): ... + or with arguments: + @initializerRegistry(arg1, arg2, kw=val) + def f(...): ... + We save (func.__module__, func.__qualname__, func, args, kwargs) so that during + package initialize() we can dynamically resolve the callable from the module + (this handles classmethod/staticmethod ordering issues). 
+ """ + if decorator_args and callable(decorator_args[0]) and len(decorator_args) == 1 and not decorator_kwargs: + func = decorator_args[0] + initializerList.append((func.__module__, func.__qualname__, func, (), {})) + return func + + def _decorator(func: Callable[..., Any]): + initializerList.append((func.__module__, func.__qualname__, func, decorator_args, decorator_kwargs)) + return func + + return _decorator + + +class WordSegmentationStrategy(ABC): + """Abstract base class for word segmentation strategies.""" + + def __init__(self, text: str, encoding: str | None = None): + self.text: str = text + self.encoding: str | None = encoding + self.wordEnds: list[int] = [] + + @abstractmethod + def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: # TODO: optimize + """Return (start inclusive, end exclusive) or None. Offsets are str offsets relative to self.text.""" + pass + + @abstractmethod + def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: + """Segmented result with separators.""" + pass + + def getWordOffsetRange( + self, + offset: int, + ) -> tuple[int, int] | None: + """Helper to get word offset range from a list of word end offsets.""" + if not self.wordEnds: + return None + index = next((i for i, end in enumerate(self.wordEnds) if end > offset), len(self.wordEnds) - 1) + start = 0 if index == 0 else self.wordEnds[index - 1] + end = self.wordEnds[index] + return (start, end) + + @classmethod + def isUsingRelatedLanguage(cls) -> bool: + """Returns True if this strategy is for the current language.""" + + if not hasattr(cls, "_LANGUAGE_PATTERN"): + return False + + import languageHandler + import braille + + return ( + re.match(cls._LANGUAGE_PATTERN, languageHandler.getWindowsLanguage()) + or re.match(cls._LANGUAGE_PATTERN, languageHandler.getLanguage()) + or re.match(cls._LANGUAGE_PATTERN, braille.handler.table.fileName) + ) + + +class UniscribeWordSegmentationStrategy(WordSegmentationStrategy): + """Windows 
Uniscribe-based segmentation (calls NVDAHelper.localLib.calculateWordOffsets).""" + + # Copied from OffsetTextInfos. TODO: optimize + def _calculateUniscribeOffsets( + self, + lineText: str, + relOffset: int, + ) -> tuple[int, int] | None: + """ + Calculates the bounds of a unit at an offset within a given string of text + using the Windows uniscribe library, also used in Notepad, for example. + Units supported are character and word. + @param lineText: the text string to analyze + @param relOffset: the character offset within the text string at which to calculate the bounds. + """ + + import NVDAHelper + + helperFunc = NVDAHelper.localLib.calculateWordOffsets + + relStart = ctypes.c_int() + relEnd = ctypes.c_int() + # uniscribe does some strange things + # when you give it a string with not more than two alphanumeric chars in a row. + # Inject two alphanumeric characters at the end to fix this + uniscribeLineText = lineText + "xx" + # We can't rely on len(lineText) to calculate the length of the line. + offsetConverter = textUtils.WideStringOffsetConverter(lineText) + lineLength = offsetConverter.encodedStringLength + if self.encoding != textUtils.WCHAR_ENCODING: + # We need to convert the str based line offsets to wide string offsets. + relOffset = offsetConverter.strToEncodedOffsets(relOffset, relOffset)[0] + uniscribeLineLength = lineLength + 2 + if helperFunc( + uniscribeLineText, + uniscribeLineLength, + relOffset, + ctypes.byref(relStart), + ctypes.byref(relEnd), + ): + relStart = relStart.value + relEnd = min(lineLength, relEnd.value) + if self.encoding != textUtils.WCHAR_ENCODING: + # We need to convert the uniscribe based offsets to str offsets. 
+ relStart, relEnd = offsetConverter.encodedToStrOffsets(relStart, relEnd) + return (relStart, relEnd) + return None + + def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: + return self._calculateUniscribeOffsets(self.text, offset) + + def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: + return self.text + + +class ChineseWordSegmentationStrategy(WordSegmentationStrategy): + _lib = None + _LANGUAGE_PATTERN = re.compile(r"^zh", re.IGNORECASE) + + @classmethod + @initializerRegistry + def _initCppJieba(cls, forceInit: bool = False): # TODO: make cppjieba alternative + """ + Class-level initializer: attempts to load the versioned cppjieba library and + set up ctypes signatures. + """ + import config + + if not forceInit and ( + cls._lib + or ( + config.conf["documentNavigation"]["wordSegmentationStandard"].calculated() + != config.featureFlagEnums.WordNavigationUnitFlag.CHINESE + and not cls.isUsingRelatedLanguage() + ) + ): + return + try: + from NVDAState import ReadPaths + + lib_path = os.path.join(ReadPaths.coreArchLibPath, "cppjieba.dll") + cls._lib = ctypes.cdll.LoadLibrary(lib_path) + + # Setup function signatures + # bool initJieba(const char* dictDir) + cls._lib.initJieba.restype = c_bool + cls._lib.initJieba.argtypes = [c_char_p] + + # bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen) + cls._lib.calculateWordOffsets.restype = c_bool + cls._lib.calculateWordOffsets.argtypes = [c_char_p, POINTER(POINTER(c_int)), POINTER(c_int)] + + # bool insertUserWord(const char* word, int freq, const char* tag) + cls._lib.insertUserWord.restype = c_bool + cls._lib.insertUserWord.argtypes = [c_char_p, c_int, c_char_p] + + # bool deleteUserWord(const char* word, const char* tag) + cls._lib.deleteUserWord.restype = c_bool + cls._lib.deleteUserWord.argtypes = [c_char_p, c_char_p] + + # bool find(const char* word) + cls._lib.find.restype = c_bool + cls._lib.find.argtypes = [c_char_p] + + # void 
freeOffsets(int* offsets) + cls._lib.freeOffsets.restype = None + cls._lib.freeOffsets.argtypes = [POINTER(c_int)] + + # Initialize with dictionary path + import globalVars + + DICTS_DIR = os.path.join(globalVars.appDir, "cppjieba", "dicts") + DICTS_DIR_BYTES = DICTS_DIR.encode("utf-8") + dictDir = create_string_buffer(DICTS_DIR_BYTES) + cls._lib.initJieba(dictDir) + except Exception as e: + log.debugWarning("Failed to load cppjieba library: %s", e) + cls._lib = None + + @lru_cache(maxsize=256) + def _callCppjiebaCached(self, text_utf8: bytes) -> list[int] | None: + if self._lib is None: + return None + + charPtr = POINTER(c_int)() + outLen = c_int(0) + + try: + success: bool = self._lib.calculateWordOffsets(text_utf8, byref(charPtr), byref(outLen)) + if not success or not bool(charPtr) or outLen.value <= 0: + return None + + try: + n = outLen.value + offsets = [charPtr[i] for i in range(n)] + return offsets + finally: + self._lib.freeOffsets(charPtr) + except Exception as e: + log.debugWarning("Exception calling cppjieba: %s", e) + try: + if bool(charPtr): + self._lib.freeOffsets(charPtr) + except Exception: + pass + return None + + def _callCPPJieba(self) -> list[int] | None: + """ + Instance method: encode self.text and call cppjieba. + Returns list[int] on success, None on failure. + Uses LRU cache keyed by utf-8 bytes. 
+ """ + data = self.text.encode("utf-8") + + if getattr(self, "_lib", None) is ChineseWordSegmentationStrategy._lib: + return self._callCppjiebaCached(data) + else: + if self._lib is None: + return None + + charPtr = POINTER(c_int)() + outLen = c_int(0) + try: + success: bool = self._lib.calculateWordOffsets(data, byref(charPtr), byref(outLen)) + if not success or not bool(charPtr) or outLen.value <= 0: + return None + + try: + n = outLen.value + return [charPtr[i] for i in range(n)] + finally: + self._lib.freeOffsets(charPtr) + except Exception as e: + log.debugWarning("Exception calling cppjieba: %s", e) + try: + if bool(charPtr): + self._lib.freeOffsets(charPtr) + except Exception: + pass + return None + + def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: + """Segments the text using the word end indices.""" + + if len(self.wordEnds) <= 1: + return self.text + + result = "" + for sepIndex in range(len(self.wordEnds) - 1): + preIndex = 0 if sepIndex == 0 else self.wordEnds[sepIndex - 1] + curIndex = self.wordEnds[sepIndex] + postIndex = self.wordEnds[sepIndex + 1] + + # append the token before the potential separator position + result += self.text[preIndex:curIndex] + + # quick checks: avoid adding duplicate separator if already present + if result.endswith(sep) or self.text[curIndex:postIndex].startswith(sep): + # separator already present at either side -> skip adding + continue + + # Unicode categories for punctuation + PUNCTUATION_CATEGORIES: str = "pP" + # Determine whether any punctuation forbids a separator + noSep = ( + unicodedata.category(self.text[curIndex - 1])[0] in PUNCTUATION_CATEGORIES + or unicodedata.category(self.text[curIndex])[0] in PUNCTUATION_CATEGORIES + ) + + if not noSep: + # If neither side forbids the separator, add it + result += sep + if newSepIndex is not None: + newSepIndex.append(len(result) - len(sep)) + else: + # append the final trailing token after the loop + result += 
self.text[curIndex:postIndex] + + return result + + def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: + return self.getWordOffsetRange(offset) + + def __init__(self, text, encoding=None): + super().__init__(text, encoding) + self.wordEnds = self._callCPPJieba() diff --git a/source/textUtils/wordSeg/wordSegUtils.py b/source/textUtils/wordSeg/wordSegUtils.py new file mode 100644 index 00000000000..d26a26cd9ba --- /dev/null +++ b/source/textUtils/wordSeg/wordSegUtils.py @@ -0,0 +1,119 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +from functools import cached_property +from textUtils import OffsetConverter, WordSegmenter + + +class WordSegWithSeparatorOffsetConverter(OffsetConverter): + """An offset converter for text with word segmentation separator.""" + + sep: str = " " + computedStrToEncodedOffsets: list[int] + computedEncodedToStrOffsets: list[int] + + def __init__(self, text: str): + super().__init__(text) + self.newSepIndex: list[int] = [] + self.encoded = WordSegmenter(text).segmentedText(sep=self.sep, newSepIndex=self.newSepIndex) + + @cached_property + def _separatorFlag(self) -> list[bool]: + isSep = [False] * self.encodedStringLength + for pos in self.newSepIndex: + isSep[pos] = True + return isSep + + @cached_property + def computedStrToEncodedOffsets(self) -> list[int]: + """ + Compute a list of offsets so that: + encodedIndex = strIndex + relevantStrToEncodedOffsets[strIndex] + + We build an explicit mapping from original string indices to encoded indices + by marking separator positions in the encoded string and then assigning + each non-separator encoded slot to the next original-character index. 
+ The returned list contains the delta (encodedIndex - strIndex) for each + original index. + """ + strLen = self.strLength + + # build explicit str -> encoded mapping + strToEncoded: list[int] = [0] * strLen + nextStrIndex = 0 + for encodedIndex in range(self.encodedStringLength): + if not self._separatorFlag[encodedIndex]: + # assign the current original-char index to this encoded slot + # then advance to the next original index + if nextStrIndex >= strLen: + # defensive: there should not be more non-sep encoded slots than strLen + # but handle gracefully + break + strToEncoded[nextStrIndex] = encodedIndex + nextStrIndex += 1 + + return strToEncoded + + @cached_property + def computedEncodedToStrOffsets(self) -> list[int]: + # build explicit encoded -> str mapping + # semantics: separator positions and the following encoded character + # both map to the same upcoming original str index (insertion point semantics). + encodedToStr: list[int] = [0] * self.encodedStringLength + nextStrIndex = 0 + for encodedIndex in range(self.encodedStringLength): + if self._separatorFlag[encodedIndex]: + # map separator to the next original character index (insertion point) + encodedToStr[encodedIndex] = nextStrIndex + else: + # map this encoded character to the current original index, + # then advance the original index for subsequent positions + encodedToStr[encodedIndex] = nextStrIndex + nextStrIndex += 1 + + return encodedToStr + + @cached_property + def encodedStringLength(self) -> int: + """Returns the length of the string in its subclass-specific encoded representation.""" + return len(self.encoded) + + def strToEncodedOffsets( + self, + strStart: int, + strEnd: int | None = None, + raiseOnError: bool = False, + ) -> int | tuple[int, int]: + super().strToEncodedOffsets(strStart, strEnd, raiseOnError) + if strStart == 0: + resultStart = 0 + else: + resultStart = self.computedStrToEncodedOffsets[strStart] + if strEnd is None: + return resultStart + elif strStart == strEnd: + 
return (resultStart, resultStart) + else: + resultEnd = self.computedStrToEncodedOffsets[strEnd] + return (resultStart, resultEnd) + + def encodedToStrOffsets( + self, + encodedStart: int, + encodedEnd: int | None = None, + raiseOnError: bool = False, + ) -> int | tuple[int]: + super().encodedToStrOffsets(encodedStart, encodedEnd, raiseOnError) + if encodedStart == 0: + resultStart = 0 + else: + resultStart = self.computedEncodedToStrOffsets[encodedStart] + if encodedEnd is None: + return resultStart + elif encodedStart == encodedEnd: + return (resultStart, resultStart) + else: + resultEnd = self.computedEncodedToStrOffsets[encodedEnd] + return (resultStart, resultEnd) diff --git a/tests/unit/test_textInfos.py b/tests/unit/test_textInfos.py index 5ada05ff6ab..882e58782c3 100644 --- a/tests/unit/test_textInfos.py +++ b/tests/unit/test_textInfos.py @@ -176,6 +176,22 @@ def test_setEndpoint(self): self.assertEqual((ti1._startOffset, ti1._endOffset), (5, 5)) +class TestWordExpansion(unittest.TestCase): + def test_expandWordDoesNotRequireFlowsToBeforeEndOfStory(self): + obj = BasicTextProvider(text="one two") + ti = obj.makeTextInfo(Offsets(0, 0)) + ti.expand(textInfos.UNIT_WORD) + self.assertEqual(ti.text, "one ") + + def test_expandWordAtEndOfStoryWithoutFlowsToDoesNothing(self): + obj = BasicTextProvider(text="one two") + ti = obj.makeTextInfo(textInfos.POSITION_ALL) + ti.collapse(end=True) + ti.expand(textInfos.UNIT_WORD) + self.assertEqual(ti.text, "") + self.assertEqual(ti.offsets, (7, 7)) + + class TestMoveToCodepointOffsetInBlackBoxTextInfo(unittest.TestCase): THREE_CHARS = "012" TEN_CHARS = "0123456789" diff --git a/tests/unit/test_textUtils.py b/tests/unit/test_textUtils.py index 6993ac7d962..048c8580e78 100644 --- a/tests/unit/test_textUtils.py +++ b/tests/unit/test_textUtils.py @@ -1,14 +1,15 @@ # A part of NonVisual Desktop Access (NVDA) # This file is covered by the GNU General Public License. # See the file COPYING for more details. 
-# Copyright (C) 2019-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter +# Copyright (C) 2019-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong """Unit tests for the textUtils module.""" import unittest -from textUtils import UnicodeNormalizationOffsetConverter, WideStringOffsetConverter +from textUtils import UnicodeNormalizationOffsetConverter, WideStringOffsetConverter, WordSegmenter from textUtils.uniscribe import splitAtCharacterBoundaries +from textUtils.segFlag import WordSegFlag FACE_PALM = "\U0001f926" # 🤦 SMILE = "\U0001f60a" # 😊 @@ -442,3 +443,28 @@ def test_sentenceWithComposites(self): def test_hebrew(self): self._testHelper("בְּרֵאשִׁית", ["בְּ", "רֵ", "א", "שִׁ", "י", "ת"]) + + +class TestWordSegmenter(unittest.TestCase): + """Tests for the WordSegmenter class.""" + + def test_basicLatin(self): + text = "hello world" + segmenter = WordSegmenter(text, wordSegFlag=WordSegFlag.UNISCRIBE) + self.assertEqual(segmenter.getSegmentForOffset(0), (0, 6)) + self.assertEqual(segmenter.getSegmentForOffset(5), (0, 6)) + self.assertEqual(segmenter.getSegmentForOffset(6), (6, 11)) + self.assertEqual(segmenter.getSegmentForOffset(11), (6, 11)) + + def test_chinese(self): + text = "你好世界" + + from textUtils.wordSeg.wordSegStrategy import ChineseWordSegmentationStrategy + + ChineseWordSegmentationStrategy._initCppJieba(forceInit=True) + segmenter = WordSegmenter(text, wordSegFlag=WordSegFlag.CHINESE) + self.assertEqual(segmenter.getSegmentForOffset(0), (0, 2)) + self.assertEqual(segmenter.getSegmentForOffset(1), (0, 2)) + self.assertEqual(segmenter.getSegmentForOffset(2), (2, 4)) + self.assertEqual(segmenter.getSegmentForOffset(3), (2, 4)) + self.assertEqual(segmenter.getSegmentForOffset(4), (2, 4)) diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index 83f9e17a0f4..2bfd839bf77 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -28,6 +28,9 @@ Consult the speech dictionaries section in the User Guide for more 
details. (#19506, @LeonarddeR) * When resetting the configuration to factory defaults from the NVDA menu, a dialog is now shown afterwards with an Undo button to restore the previous configuration. The triple-press keyboard shortcut (`NVDA+ctrl+r`) is not affected, as it is intended for recovery scenarios. (#19575, @bramd) +* Chinese text can now be navigated by word using built-in input gestures. + Several GUI elements were added to configure this in the `Document Navigation` panel. (#18735, @CrazySteve0605) +* Braille output for Chinese now includes spaces between words. (#18865, @CrazySteve0605) * Added an unassigned command to report the current status of the Screen Curtain. (#19759) * DotPad braille displays now support multi-button combination gestures. (#19565, @bramd) * You can now press multiple buttons simultaneously to create custom gestures (e.g., `f1+panLeft`). @@ -91,6 +94,7 @@ It only ran the translation string comment check, which is equivalent to `scons The `scons checkPot` target has also been replaced with `runcheckpot.bat`. Use the individual test commands instead: `runcheckpot.bat`, `rununittests.bat`, `runsystemtests.bat`, `runlint.bat`. (#19606, #19676, @bramd) * Updated Python 3.13.11 to 3.13.12 (#19572, @dpy013) +* Added [cppjieba](https://github.com/yanyiwu/cppjieba) as a git submodule for Chinese word segmentation. (#18548, @CrazySteve0605) * Added a private `_asyncioEventLoop` module that provides an asyncio event loop running on a background thread for use by NVDA components. (#19816, @bramd) * Added several functions related to the braille auto-scroll feature. (#18573, @nvdaes): * Added an `autoScroll` method to `braille.handler`. @@ -106,6 +110,8 @@ Use the individual test commands instead: `runcheckpot.bat`, `rununittests.bat`, * The `speechDictHandler.ENTRY_TYPE_*` constants are deprecated. Use the `speechDictHandler.types.EntryType` enumeration instead. 
(#19430, @LeonarddeR) * `speechDictHandler.SpeechDictEntry` and `speechDictHandler.SpeechDict` have been moved to `speechDictHandler.types`. (#19430, @LeonarddeR) +* `useUniscribe` from `textInfos.offsets.OffsetsTextInfo` and its subclasses is deprecated. + Use `charSegFlag` and `wordSegFlag` instead. (#18735)