From 5cb5189d819d7e51be34e15467f92ce4a4f161e2 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Wed, 23 Jul 2025 17:30:56 +0800 Subject: [PATCH 01/93] Introduce cppjieba as a submodule for Chinese word segmentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add `cppjieba` as a Git submodule under `third_party/cppjieba/` to provide robust Chinese word segmentation capabilities. - Update `.gitmodules` to point to the official `cppjieba` repository and configure it to track the `master` branch. - Update 'sconscript' to include the paths of 'cppjieba' and its dependency 'limonp' - Modify `copying.txt` to include the `cppjieba` license (MIT) alongside the project’s existing license, ensuring proper attribution and compliance. - Update documents --- .gitmodules | 3 +++ copying.txt | 1 + include/cppjieba | 1 + include/readme.md | 6 ++++++ nvdaHelper/local/sconscript | 14 ++++++++++++-- projectDocs/dev/createDevEnvironment.md | 1 + 6 files changed, 24 insertions(+), 2 deletions(-) create mode 160000 include/cppjieba diff --git a/.gitmodules b/.gitmodules index 209e147532f..a8e739852ac 100644 --- a/.gitmodules +++ b/.gitmodules @@ -39,3 +39,6 @@ path = .vscode url = https://github.com/nvaccess/vscode-nvda.git ignore = dirty +[submodule "include/cppjieba"] + path = include/cppjieba + url = https://github.com/yanyiwu/cppjieba diff --git a/copying.txt b/copying.txt index b56af8f7189..86a0cbd5ed9 100644 --- a/copying.txt +++ b/copying.txt @@ -356,6 +356,7 @@ In addition to these dependencies, the following are also included in NVDA: - Microsoft Detours: MIT - Python: PSF - NSIS: zlib/libpng +- cppjieba: MIT Furthermore, NVDA also utilises some static/binary dependencies, details of which can be found at the following URL: diff --git a/include/cppjieba b/include/cppjieba new file mode 160000 index 00000000000..9b40903ed6c --- /dev/null +++ b/include/cppjieba @@ -0,0 +1 @@ +Subproject commit 
9b40903ed6cbd795367ea64f9a7d3f3bc4aa4714 diff --git a/include/readme.md b/include/readme.md index e4f9b37fde2..0cb02de74ee 100644 --- a/include/readme.md +++ b/include/readme.md @@ -49,3 +49,9 @@ Used in chrome system tests. https://github.com/microsoft/wil/ Fetch latest from master. + +### cppjieba +[cppjieba](https://github.com/yanyiwu/cppjieba) + +Fetch latest from master. +Used for Chinese text segmentation. diff --git a/nvdaHelper/local/sconscript b/nvdaHelper/local/sconscript index bf03ed4c913..128becf137a 100644 --- a/nvdaHelper/local/sconscript +++ b/nvdaHelper/local/sconscript @@ -1,7 +1,7 @@ ### # This file is a part of the NVDA project. # URL: http://www.nvda-project.org/ -# Copyright 2006-2010 NVDA contributers. +# Copyright 2006-2025 NVDA contributers. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2.0, as published by # the Free Software Foundation. @@ -113,8 +113,18 @@ localLib = env.SharedLibrary( "Gdiplus", "Iphlpapi", "Ws2_32", - "runtimeobject", + "runtimeobject", ], ) +cppjiebaPath = Dir("#include/cppjieba") +cppjiebaSrcPath = cppjiebaPath.Dir("include/cppjieba") +LimonpPath = cppjiebaPath.Dir("deps/limonp") +LimonpSrcPath = LimonpPath.Dir("include/limonp") + +env.Prepend(CPPPATH=[ + cppjiebaSrcPath, + LimonpSrcPath.Dir(".."), +]) + Return("localLib") diff --git a/projectDocs/dev/createDevEnvironment.md b/projectDocs/dev/createDevEnvironment.md index 27233bef6d9..e9e876761bd 100644 --- a/projectDocs/dev/createDevEnvironment.md +++ b/projectDocs/dev/createDevEnvironment.md @@ -99,6 +99,7 @@ If you aren't sure, run `git submodule update` after every git pull, merge or ch * [Java Access Bridge 32 bit, from Zulu Community OpenJDK build 17.0.9+8Zulu (17.46.19)](https://github.com/nvaccess/javaAccessBridge32-bin) * [Windows Implementation Libraries (WIL)](https://github.com/microsoft/wil/) * [NVDA DiffMatchPatch](https://github.com/codeofdusk/nvda_dmp) +* 
[cppjieba](https://github.com/yanyiwu/cppjieba), commit '9b40903ed6cbd795367ea64f9a7d3f3bc4aa4714' #### Build time dependencies From fb4efef4f66a5c0f626fe9a201d87e7030a6270b Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Thu, 24 Jul 2025 16:51:49 +0800 Subject: [PATCH 02/93] Update what's new --- user_docs/en/changes.md | 1 + 1 file changed, 1 insertion(+) diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index 8b7f55445de..a15b2427c15 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -28,6 +28,7 @@ Please refer to [the developer guide](https://download.nvaccess.org/documentatio * Updated `include` dependencies: * detours to `9764cebcb1a75940e68fa83d6730ffaf0f669401`. (#18447, @LeonarddeR) * The `nvda_dmp` utility has been removed. (#18480, @codeofdusk) +* Added [cppjieba](https://github.com/yanyiwu/cppjieba) as a git submodule for word segmentation. (#18548, @CrazySteve0605) #### Deprecations From ae58e9b9d24884718e779c61c8de7b6651911a02 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Thu, 24 Jul 2025 16:59:26 +0800 Subject: [PATCH 03/93] Add comments for building script of cppjieba and its dependency --- nvdaHelper/local/sconscript | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nvdaHelper/local/sconscript b/nvdaHelper/local/sconscript index 128becf137a..8c747560d2f 100644 --- a/nvdaHelper/local/sconscript +++ b/nvdaHelper/local/sconscript @@ -117,9 +117,9 @@ localLib = env.SharedLibrary( ], ) -cppjiebaPath = Dir("#include/cppjieba") +cppjiebaPath = Dir("#include/cppjieba") # for Chinese word segmentation cppjiebaSrcPath = cppjiebaPath.Dir("include/cppjieba") -LimonpPath = cppjiebaPath.Dir("deps/limonp") +LimonpPath = cppjiebaPath.Dir("deps/limonp") # cppjieba's dependency LimonpSrcPath = LimonpPath.Dir("include/limonp") env.Prepend(CPPPATH=[ From 06070c12f7261ddac8dbd223e6d561950cad500d Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Fri, 1 Aug 2025 
13:04:22 +0800 Subject: [PATCH 04/93] Update projectDocs/dev/createDevEnvironment.md Co-authored-by: Sean Budd --- projectDocs/dev/createDevEnvironment.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/projectDocs/dev/createDevEnvironment.md b/projectDocs/dev/createDevEnvironment.md index e9e876761bd..f7a1f65722f 100644 --- a/projectDocs/dev/createDevEnvironment.md +++ b/projectDocs/dev/createDevEnvironment.md @@ -98,8 +98,7 @@ If you aren't sure, run `git submodule update` after every git pull, merge or ch * [Nullsoft Install System](https://nsis.sourceforge.io), version 3.11 * [Java Access Bridge 32 bit, from Zulu Community OpenJDK build 17.0.9+8Zulu (17.46.19)](https://github.com/nvaccess/javaAccessBridge32-bin) * [Windows Implementation Libraries (WIL)](https://github.com/microsoft/wil/) -* [NVDA DiffMatchPatch](https://github.com/codeofdusk/nvda_dmp) -* [cppjieba](https://github.com/yanyiwu/cppjieba), commit '9b40903ed6cbd795367ea64f9a7d3f3bc4aa4714' +* [cppjieba - Chinese word segmentation](https://github.com/yanyiwu/cppjieba), commit `9b40903ed6cbd795367ea64f9a7d3f3bc4aa4714` #### Build time dependencies From 2273a604e6df8fd0245512a4d3c26fe6bf5a0c66 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Fri, 1 Aug 2025 13:06:08 +0800 Subject: [PATCH 05/93] Update include/readme.md Co-authored-by: Sean Budd --- include/readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/include/readme.md b/include/readme.md index 0cb02de74ee..d1e5c16ce94 100644 --- a/include/readme.md +++ b/include/readme.md @@ -51,6 +51,7 @@ https://github.com/microsoft/wil/ Fetch latest from master. ### cppjieba + [cppjieba](https://github.com/yanyiwu/cppjieba) Fetch latest from master. 
From 3d4d9f11b5a2f822ab4c5211dfe1a00df112d945 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Fri, 1 Aug 2025 13:59:53 +0800 Subject: [PATCH 06/93] Remove changes in sconscript for localLIb --- nvdaHelper/local/sconscript | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/nvdaHelper/local/sconscript b/nvdaHelper/local/sconscript index 8c747560d2f..bf03ed4c913 100644 --- a/nvdaHelper/local/sconscript +++ b/nvdaHelper/local/sconscript @@ -1,7 +1,7 @@ ### # This file is a part of the NVDA project. # URL: http://www.nvda-project.org/ -# Copyright 2006-2025 NVDA contributers. +# Copyright 2006-2010 NVDA contributers. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2.0, as published by # the Free Software Foundation. @@ -113,18 +113,8 @@ localLib = env.SharedLibrary( "Gdiplus", "Iphlpapi", "Ws2_32", - "runtimeobject", + "runtimeobject", ], ) -cppjiebaPath = Dir("#include/cppjieba") # for Chinese word segmentation -cppjiebaSrcPath = cppjiebaPath.Dir("include/cppjieba") -LimonpPath = cppjiebaPath.Dir("deps/limonp") # cppjieba's dependency -LimonpSrcPath = LimonpPath.Dir("include/limonp") - -env.Prepend(CPPPATH=[ - cppjiebaSrcPath, - LimonpSrcPath.Dir(".."), -]) - Return("localLib") From 1fbf05f481e3c3c8180d45a0fa99cf299a41b153 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Mon, 4 Aug 2025 09:59:03 +0800 Subject: [PATCH 07/93] add building script for cppjieba --- nvdaHelper/archBuild_sconscript | 6 +++++- nvdaHelper/cppjieba/sconscript | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 nvdaHelper/cppjieba/sconscript diff --git a/nvdaHelper/archBuild_sconscript b/nvdaHelper/archBuild_sconscript index 8ce0bd3e9b3..cb3345b30ac 100644 --- a/nvdaHelper/archBuild_sconscript +++ b/nvdaHelper/archBuild_sconscript @@ -1,5 +1,5 @@ # A part of NonVisual Desktop Access (NVDA) 
-# Copyright (C) 2006-2023 NV Access Limited +# Copyright (C) 2006-2025 NV Access Limited # This file may be used under the terms of the GNU General Public License, version 2 or later. # For more details see: https://www.gnu.org/licenses/gpl-2.0.html @@ -209,6 +209,10 @@ Export("detoursLib") apiHookObj = env.Object("apiHook", "common/apiHook.cpp") Export("apiHookObj") +cppjiebaLib = env.SConscript("cppjieba/sconscript") +Export("cppjiebaLib") +env.Install(libInstallDir, cppjiebaLib) + if TARGET_ARCH == "x86": localLib = env.SConscript("local/sconscript") Export("localLib") diff --git a/nvdaHelper/cppjieba/sconscript b/nvdaHelper/cppjieba/sconscript new file mode 100644 index 00000000000..0dc2dc992a7 --- /dev/null +++ b/nvdaHelper/cppjieba/sconscript @@ -0,0 +1,33 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Wang Chong. +# This file may be used under the terms of the GNU General Public License, version 2 or later. +# For more details see: https://www.gnu.org/licenses/gpl-2.0.html + +Import(["thirdPartyEnv"]) + +import typing # noqa: E402 +import os + +thirdPartyEnv: Environment = thirdPartyEnv +env: Environment = typing.cast(Environment, thirdPartyEnv.Clone()) + +cppjiebaPath = Dir("#include/cppjieba") +cppjiebaSrcPath = cppjiebaPath.Dir("include/cppjieba") +LimonpPath = cppjiebaPath.Dir("deps/limonp") # cppjieba's dependency +LimonpSrcPath = LimonpPath.Dir("include/limonp") + +env.Prepend( + CPPPATH=[ + cppjiebaSrcPath, + LimonpSrcPath.Dir(".."), + ] +) + +sourceFiles = [ + "cppjieba.cpp", + "cppjieba.def", +] + +cppjiebaLib = env.SharedLibrary(target="cppjieba", source=sourceFiles) + +Return("cppjiebaLib") From 7de7464efd94551d100628774e63cad7505bfc35 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Tue, 5 Aug 2025 12:52:39 +0800 Subject: [PATCH 08/93] add JiebaSingleton wrapper and C API for NVDA segmentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 
Introduce `JiebaSingleton` class in `cppjieba.hpp`/`cppjieba.cpp` with def file under nvdaHelper/cppjieba/' - Inherits from `cppjieba::Jieba` and exposes a thread-safe `getOffsets()` method - Implements Meyers’ singleton via `getInstance()` with a private constructor - Deletes copy constructor, copy assignment, move constructor, and move assignment to enforce single instance - Add C-style API in the same module: - `int initJieba()` to force singleton initialization - `int segmentOffsets(const char* text, int** charOffsets, int* outLen)` to perform segmentation and return character offsets - `void freeOffsets(int* ptr)` to release allocated offset buffer --- nvdaHelper/cppjieba/cppjieba.cpp | 85 ++++++++++++++++++++++++++++++++ nvdaHelper/cppjieba/cppjieba.def | 5 ++ nvdaHelper/cppjieba/cppjieba.hpp | 71 ++++++++++++++++++++++++++ 3 files changed, 161 insertions(+) create mode 100644 nvdaHelper/cppjieba/cppjieba.cpp create mode 100644 nvdaHelper/cppjieba/cppjieba.def create mode 100644 nvdaHelper/cppjieba/cppjieba.hpp diff --git a/nvdaHelper/cppjieba/cppjieba.cpp b/nvdaHelper/cppjieba/cppjieba.cpp new file mode 100644 index 00000000000..cda25a9d1f9 --- /dev/null +++ b/nvdaHelper/cppjieba/cppjieba.cpp @@ -0,0 +1,85 @@ +/* +This file is a part of the NVDA project. +URL: http://www.nvda-project.org/ +Copyright 2025 NV Access Limited, Wang Chong. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2.0, as published by + the Free Software Foundation. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+This license can be found at: +http://www.gnu.org/licenses/old-licenses/gpl-2.0.html +*/ + +#include "cppjieba.hpp" + +JiebaSingleton& JiebaSingleton::getInstance() { + // C++11 guarantees thread-safe init of this local static + static JiebaSingleton instance; + return instance; +} + +JiebaSingleton::JiebaSingleton(): cppjieba::Jieba() { } // call base ctor to load dictionaries, models, etc. + +void JiebaSingleton::getOffsets(const std::string& text, std::vector& charOffsets) { + std::lock_guard lock(segMutex); + std::vector words; + this->Cut(text, words, true); + + int cumulative = 0; + for (auto const& w : words) { + int wc = 0; + auto ptr = reinterpret_cast(w.c_str()); + size_t i = 0, len = w.size(); + while (i < len) { + unsigned char c = ptr[i]; + if ((c & 0x80) == 0) i += 1; + else if ((c & 0xE0) == 0xC0) i += 2; + else if ((c & 0xF0) == 0xE0) i += 3; + else if ((c & 0xF8) == 0xF0) i += 4; + else i += 1; + ++wc; + } + cumulative += wc; + charOffsets.push_back(cumulative); + } +} + +extern "C" { + +int initJieba() { + try { + // simply force the singleton into existence + (void)JiebaSingleton::getInstance(); + return 0; + } catch (...) 
{ + return -1; + } +} + +int segmentOffsets(const char* text, int** charOffsets, int* outLen) { + if (!text || !charOffsets || !outLen) return -1; + // we assume initJieba() has already been called successfully + + std::string input(text); + std::vector offs; + JiebaSingleton::getInstance().getOffsets(input, offs); + + int n = static_cast(offs.size()); + int* buf = static_cast(std::malloc(sizeof(int) * n)); + if (!buf) { + *outLen = 0; + return -1; + } + for (int i = 0; i < n; ++i) buf[i] = offs[i]; + *charOffsets = buf; + *outLen = n; + return 0; +} + +void freeOffsets(int* ptr) { + if (ptr) free(ptr); +} + +} // extern "C" diff --git a/nvdaHelper/cppjieba/cppjieba.def b/nvdaHelper/cppjieba/cppjieba.def new file mode 100644 index 00000000000..cc2246d0f21 --- /dev/null +++ b/nvdaHelper/cppjieba/cppjieba.def @@ -0,0 +1,5 @@ +LIBRARY cppjieba +EXPORTS + initJieba + segmentOffsets + freeOffsets diff --git a/nvdaHelper/cppjieba/cppjieba.hpp b/nvdaHelper/cppjieba/cppjieba.hpp new file mode 100644 index 00000000000..ca940e305a5 --- /dev/null +++ b/nvdaHelper/cppjieba/cppjieba.hpp @@ -0,0 +1,71 @@ +/* +This file is a part of the NVDA project. +URL: http://www.nvda-project.org/ +Copyright 2025 NV Access Limited, Wang Chong. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2.0, as published by + the Free Software Foundation. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+This license can be found at: +http://www.gnu.org/licenses/old-licenses/gpl-2.0.html +*/ + +#ifndef CPPJIEBA_DLL_H +#define CPPJIEBA_DLL_H +#pragma once + +#include +#include +#include +#include +#include "Jieba.hpp" + +#ifdef _WIN32 +# define JIEBA_API __declspec(dllexport) +#else +# define JIEBA_API +#endif + +using namespace std; + +/// @brief Singleton wrapper around cppjieba::Jieba. +class JiebaSingleton : public cppjieba::Jieba { +public: + /// @brief Returns the single instance, constructing on first call. + static JiebaSingleton& getInstance(); + + /// @brief Do thread-safe segmentation and compute character end offsets. + /// @param text The input text in UTF-8 encoding. + /// @param charOffsets Output vector to hold character offsets. + void getOffsets(const string& text, vector& charOffsets); + +private: + JiebaSingleton(); ///< private ctor initializes base Jieba + + /// Disable copy and move + JiebaSingleton(const JiebaSingleton&) = delete; + JiebaSingleton& operator = (const JiebaSingleton&) = delete; + JiebaSingleton(JiebaSingleton&&) = delete; + JiebaSingleton& operator = (JiebaSingleton&&) = delete; + + std::mutex segMutex; ///< guards concurrent Cut() calls +}; + +extern "C" { + +/// @brief Force singleton construction (load dicts, etc.) before any segmentation. +/// @return 0 on success, -1 on failure. +JIEBA_API int initJieba(); + +/// @brief Segment UTF-8 text into character offsets. +/// @return 0 on success, -1 on failure. +JIEBA_API int segmentOffsets(const char* text, int** charOffsets, int* outLen); + +/// @brief Free memory allocated by segmentOffsets. 
+JIEBA_API void freeOffsets(int* ptr); + +} // extern "C" + +#endif // CPPJIEBA_DLL_H From 0d92c08625fbf04193776180e94fe0e4666c48d7 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Wed, 6 Aug 2025 12:05:54 +0800 Subject: [PATCH 09/93] Update GitHub action workflow to fetch cppjieba's submodule - Change 'submodules' in 'jobs - buildNVDA - Build NVDA - Checkout NVDA' from 'true' to 'recursive' to ensure cppjieba's submodule is fetched. - This will cause the submodule of sonic to be fetched as well, which seems currently unused. --- .github/workflows/testAndPublish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/testAndPublish.yml b/.github/workflows/testAndPublish.yml index 0eb83f0db88..2982ee6792a 100644 --- a/.github/workflows/testAndPublish.yml +++ b/.github/workflows/testAndPublish.yml @@ -53,7 +53,7 @@ jobs: - name: Checkout NVDA uses: actions/checkout@v4 with: - submodules: true + submodules: recursive - name: Install Python uses: actions/setup-python@v5 with: From da662bed34b6f58890dde4cc367151ba719b540b Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Wed, 6 Aug 2025 21:27:44 +0800 Subject: [PATCH 10/93] Update .gitignore for cppjieba --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 7c26e12c19b..71fa927eb8f 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ source/lib source/lib64 source/typelibs source/louis +source/cppjieba *.obj *.exp *.lib From 38a12dcffedf5f08e83789be6ad6d2bb9d3df49a Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Wed, 6 Aug 2025 21:29:22 +0800 Subject: [PATCH 11/93] Update building and setup script for cppjieba's dicts installation --- nvdaHelper/cppjieba/sconscript | 26 ++++++++++++++++++++++++-- source/setup.py | 1 + 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/nvdaHelper/cppjieba/sconscript b/nvdaHelper/cppjieba/sconscript index 0dc2dc992a7..6ac52da0be7 100644 --- 
a/nvdaHelper/cppjieba/sconscript +++ b/nvdaHelper/cppjieba/sconscript @@ -3,16 +3,23 @@ # This file may be used under the terms of the GNU General Public License, version 2 or later. # For more details see: https://www.gnu.org/licenses/gpl-2.0.html -Import(["thirdPartyEnv"]) - import typing # noqa: E402 import os +Import( + [ + "thirdPartyEnv", + "sourceDir", + ] +) thirdPartyEnv: Environment = thirdPartyEnv env: Environment = typing.cast(Environment, thirdPartyEnv.Clone()) cppjiebaPath = Dir("#include/cppjieba") cppjiebaSrcPath = cppjiebaPath.Dir("include/cppjieba") +cppjiebaDictPath = cppjiebaPath.Dir("dict") +outDir = sourceDir.Dir("cppjieba") +unitTestDictsDir = env.Dir("#tests/unit/cppjiebaDicts") LimonpPath = cppjiebaPath.Dir("deps/limonp") # cppjieba's dependency LimonpSrcPath = LimonpPath.Dir("include/limonp") @@ -30,4 +37,19 @@ sourceFiles = [ cppjiebaLib = env.SharedLibrary(target="cppjieba", source=sourceFiles) +if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # insure dicts installation happens only once and avoid a scons' warning + env.Install( + outDir.Dir("dicts"), + [ + f + for f in env.Glob(f"{cppjiebaDictPath}/*") + if f.name + not in ( + "README.md", + "pos_dict", + ) + and not f.name.endswith(".in") + ], + ) + Return("cppjiebaLib") diff --git a/source/setup.py b/source/setup.py index 2b74b89b32b..77883e574b2 100755 --- a/source/setup.py +++ b/source/setup.py @@ -263,6 +263,7 @@ def _genManifestTemplate(shouldHaveUIAccess: bool) -> tuple[int, int, bytes]: ("images", glob("images/*.ico")), ("fonts", glob("fonts/*.ttf")), ("louis/tables", glob("louis/tables/*")), + ("cppjieba/dicts", glob("cppjieba/dicts/*")), ("COMRegistrationFixes", glob("COMRegistrationFixes/*.reg")), ("miscDeps/tools", ["../miscDeps/tools/msgfmt.exe"]), (".", glob("../miscDeps/python/*.dll")), From c60c2da80fbf2466a2b50d445e5fcfba29232146 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sat, 9 Aug 
2025 20:30:58 +0800 Subject: [PATCH 12/93] update copyright headers based on @seanbudd's suggestions --- nvdaHelper/cppjieba/cppjieba.cpp | 15 ++++----------- nvdaHelper/cppjieba/cppjieba.hpp | 15 ++++----------- nvdaHelper/cppjieba/sconscript | 6 +++--- 3 files changed, 11 insertions(+), 25 deletions(-) diff --git a/nvdaHelper/cppjieba/cppjieba.cpp b/nvdaHelper/cppjieba/cppjieba.cpp index cda25a9d1f9..ccd972385e9 100644 --- a/nvdaHelper/cppjieba/cppjieba.cpp +++ b/nvdaHelper/cppjieba/cppjieba.cpp @@ -1,15 +1,8 @@ /* -This file is a part of the NVDA project. -URL: http://www.nvda-project.org/ -Copyright 2025 NV Access Limited, Wang Chong. - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License version 2.0, as published by - the Free Software Foundation. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -This license can be found at: -http://www.gnu.org/licenses/old-licenses/gpl-2.0.html +A part of NonVisual Desktop Access (NVDA) +Copyright (C) 2025 NV Access Limited, Wang Chong +This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt */ #include "cppjieba.hpp" diff --git a/nvdaHelper/cppjieba/cppjieba.hpp b/nvdaHelper/cppjieba/cppjieba.hpp index ca940e305a5..c5f0bad4af7 100644 --- a/nvdaHelper/cppjieba/cppjieba.hpp +++ b/nvdaHelper/cppjieba/cppjieba.hpp @@ -1,15 +1,8 @@ /* -This file is a part of the NVDA project. -URL: http://www.nvda-project.org/ -Copyright 2025 NV Access Limited, Wang Chong. 
- This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License version 2.0, as published by - the Free Software Foundation. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -This license can be found at: -http://www.gnu.org/licenses/old-licenses/gpl-2.0.html +A part of NonVisual Desktop Access (NVDA) +Copyright (C) 2025 NV Access Limited, Wang Chong +This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt */ #ifndef CPPJIEBA_DLL_H diff --git a/nvdaHelper/cppjieba/sconscript b/nvdaHelper/cppjieba/sconscript index 6ac52da0be7..1a1fd12a2c2 100644 --- a/nvdaHelper/cppjieba/sconscript +++ b/nvdaHelper/cppjieba/sconscript @@ -1,7 +1,7 @@ # A part of NonVisual Desktop Access (NVDA) -# Copyright (C) 2025 NV Access Limited, Wang Chong. -# This file may be used under the terms of the GNU General Public License, version 2 or later. -# For more details see: https://www.gnu.org/licenses/gpl-2.0.html +# Copyright (C) 2025 NV Access Limited, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt import typing # noqa: E402 import os From c853b64063faa3bfca92740e60f8e424d266e914 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sat, 9 Aug 2025 20:33:12 +0800 Subject: [PATCH 13/93] Update include/readme.md Co-authored-by: Sean Budd --- include/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/readme.md b/include/readme.md index d1e5c16ce94..175363fe02a 100644 --- a/include/readme.md +++ b/include/readme.md @@ -52,7 +52,7 @@ Fetch latest from master. ### cppjieba -[cppjieba](https://github.com/yanyiwu/cppjieba) +[cppjieba](https://github.com/yanyiwu/cppjieba). Fetch latest from master. Used for Chinese text segmentation. From b0ac0819632f9f405e44f763cdbac272d2fc1964 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Mon, 18 Aug 2025 11:38:48 +0800 Subject: [PATCH 14/93] add `WordSegment` module - create `WordSegmentationStrategy' as an abstract base class to select segmentation strategy based on text content, following Strategy Pattern - implement `ChineseWordSegmentationStrategy` (for Chinese text) - implement `UniscribeWordSegmentationStrategy` (for other languages as default strategy) --- source/textUtils/wordSegment.py | 169 ++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 source/textUtils/wordSegment.py diff --git a/source/textUtils/wordSegment.py b/source/textUtils/wordSegment.py new file mode 100644 index 00000000000..94f82fb860d --- /dev/null +++ b/source/textUtils/wordSegment.py @@ -0,0 +1,169 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +import os +import ctypes +from ctypes import c_char_p, c_int, POINTER, byref +from abc import ABC, abstractmethod +from functools import lru_cache +from logHandler import log +import textUtils + +class WordSegmentationStrategy(ABC): + """Abstract base class for word segmentation strategies.""" + + def __init__(self, text: str, encoding: str | None = None): + self.text = text + self.encoding = encoding + + @abstractmethod + def getSegmentForOffset(self, text: str, offset: int) -> tuple[int, int] | None: # TODO: optimize + """Return (start inclusive, end exclusive) or None. Offsets are str offsets relative to self.text.""" + pass + + +class ChineseWordSegmentationStrategy(WordSegmentationStrategy): + _lib = None + + def __init__(self, text: str, encoding: str | None = None): + super().__init__(text, encoding) + self._ensureLibLoaded() + + @classmethod + def _ensureLibLoaded(cls): # TODO: make cppjieba alternative + if cls._lib is not None: + return + try: + from NVDAHelper import versionedLibPath + lib_path = os.path.join(versionedLibPath, "cppjieba.dll") + cls._lib = ctypes.cdll.LoadLibrary(lib_path) + # Setup function signatures (adjust if your C API differs) + cls._lib.initJieba.restype = c_int + cls._lib.initJieba.argtypes = [] + + # int segmentOffsets(const char* utf8Text, int** outOffsets, int* outLen) + cls._lib.segmentOffsets.restype = c_int + cls._lib.segmentOffsets.argtypes = [c_char_p, POINTER(POINTER(c_int)), POINTER(c_int)] + + # void freeOffsets(int* offsets) + cls._lib.freeOffsets.restype = None + cls._lib.freeOffsets.argtypes = [POINTER(c_int)] + + cls._lib.initJieba() + except Exception as e: + log.debugWarning("Failed to load cppjieba library: %s", e) + cls._lib = None + + @staticmethod + @lru_cache(maxsize=256) + def _callCppjiebaCached(text_utf8: bytes) -> list[int]: + """Module-level cached wrapper to call the C 
library given utf8 bytes.""" + if ChineseWordSegmentationStrategy._lib is None: + return [] + lib = ChineseWordSegmentationStrategy._lib + charPtr = POINTER(c_int)() + outLen = c_int(0) + try: + res = lib.segmentOffsets(text_utf8, byref(charPtr), byref(outLen)) + if res != 0 or not bool(charPtr): + return [] + n = outLen.value + # read n ints + offsets = [charPtr[i] for i in range(n)] + # free memory allocated by C side + lib.freeOffsets(charPtr) + return offsets + except Exception as e: + log.debugWarning("Exception calling cppjieba: %s", e) + try: + if bool(charPtr): + lib.freeOffsets(charPtr) + except Exception: + pass + return [] + + @lru_cache(maxsize=128) + def _callCPPJieba(self, text: str) -> list[tuple[int, int]] | None: + data = text.encode('utf-8') + charPtr = POINTER(c_int)() + outLen = c_int() + result = self._lib.segmentOffsets(data, byref(charPtr), byref(outLen)) + if result != 0 or not charPtr: + return [], [] + n = outLen.value + char_offsets = [charPtr[i] for i in range(n)] + self._lib.freeOffsets(charPtr) + return char_offsets + + def getSegmentForOffset(self, text: str, offset: int) -> tuple[int, int] | None: + wordEnds = self._callCPPJieba(text) + if wordEnds is None or not wordEnds: + return None + index = next((i for i, end in enumerate(wordEnds) if end > offset)) + if index == 0: + start = 0 + else: + start = wordEnds[index - 1] + end = wordEnds[index] if index < len(wordEnds) else len(text) + return (start, end) + + +class UniscribeWordSegmentationStrategy(WordSegmentationStrategy): + """Windows Uniscribe-based segmentation (calls NVDAHelper.localLib.calculateWordOffsets).""" + + # Copied from OffsetTextInfos. TODO: optimize + def _calculateUniscribeOffsets( + self, + lineText: str, + #unit: str, + relOffset: int, + ) -> tuple[int, int] | None: + """ + Calculates the bounds of a unit at an offset within a given string of text + using the Windows uniscribe library, also used in Notepad, for example. + Units supported are character and word. 
+ @param lineText: the text string to analyze + @param unit: the TextInfo unit (character or word) + @param relOffset: the character offset within the text string at which to calculate the bounds. + """ + + import NVDAHelper + #if unit is textInfos.UNIT_WORD: + helperFunc = NVDAHelper.localLib.calculateWordOffsets + #elif unit is textInfos.UNIT_CHARACTER: + #helperFunc = NVDAHelper.localLib.calculateCharacterOffsets + #else: + #raise NotImplementedError(f"Unit: {unit}") + relStart = ctypes.c_int() + relEnd = ctypes.c_int() + # uniscribe does some strange things + # when you give it a string with not more than two alphanumeric chars in a row. + # Inject two alphanumeric characters at the end to fix this + uniscribeLineText = lineText + "xx" + # We can't rely on len(lineText) to calculate the length of the line. + offsetConverter = textUtils.WideStringOffsetConverter(lineText) + lineLength = offsetConverter.encodedStringLength + if self.encoding != textUtils.WCHAR_ENCODING: + # We need to convert the str based line offsets to wide string offsets. + relOffset = offsetConverter.strToEncodedOffsets(relOffset, relOffset)[0] + uniscribeLineLength = lineLength + 2 + if helperFunc( + uniscribeLineText, + uniscribeLineLength, + relOffset, + ctypes.byref(relStart), + ctypes.byref(relEnd), + ): + relStart = relStart.value + relEnd = min(lineLength, relEnd.value) + if self.encoding != textUtils.WCHAR_ENCODING: + # We need to convert the uniscribe based offsets to str offsets. 
+ relStart, relEnd = offsetConverter.encodedToStrOffsets(relStart, relEnd) + return (relStart, relEnd) + log.debugWarning(f"Uniscribe failed to calculate {unit} offsets for text {lineText!r}") + return None + + def getSegmentForOffset(self, text: str, offset: int) -> tuple[int, int] | None: + return self._calculateUniscribeOffsets(text, offset) From 9f62f04d0476407587328f228f018947dedae88b Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Mon, 18 Aug 2025 11:54:41 +0800 Subject: [PATCH 15/93] update `textUtils/__init__.py` add `WordSegmenter` class for word segmentation, integrating segmentation strategies --- source/textUtils/__init__.py | 39 +++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index d88ef055572..375fce953a8 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -1,16 +1,18 @@ # A part of NonVisual Desktop Access (NVDA) -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. -# Copyright (C) 2018-2024 NV Access Limited, Babbage B.V., Łukasz Golonka +# Copyright (C) 2018-2025 NV Access Limited, Babbage B.V., Łukasz Golonka, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt """ Classes and utilities to deal with offsets variable width encodings, particularly utf_16. """ import ctypes +import re import encodings import locale import unicodedata + from abc import ABCMeta, abstractmethod, abstractproperty from functools import cached_property from typing import Generator, Optional, Tuple, Type @@ -18,6 +20,7 @@ from logHandler import log from .uniscribe import splitAtCharacterBoundaries +from . 
import wordSegment WCHAR_ENCODING = "utf_16_le" UTF8_ENCODING = "utf-8" @@ -540,3 +543,33 @@ def getOffsetConverter(encoding: str) -> Type[OffsetConverter]: return ENCODINGS_TO_CONVERTERS[encoding] except IndexError as e: raise LookupError(f"Don't know how to deal with encoding '{encoding}'", e) + +class WordSegmenter: + """Selects appropriate segmentation strategy and segments text.""" + + # Precompiled patterns + # Chinese characters and Japanese kanjis (CJK Unified Ideographs) U+4E00 - U+9FFF + _HANZI = re.compile(r"[\u4E00-\u9FFF]") + # Japanese kanas (Hiragana U+3040 - U+309F, Katakana U+30A0 - U+30FF) + _KANA = re.compile(r"[\u3040-\u309F\u30A0-\u30FF]") + + def __init__(self, text: str, encoding): + self.text = text + self.encoding = encoding + self.strategy = self._choose_strategy(self.text, self.encoding) + + @staticmethod + def _choose_strategy(text: str, encoding) -> wordSegment.WordSegmentationStrategy: + """Choose the appropriate segmentation strategy based on the text content.""" + if WordSegmenter._HANZI.search(text) and not WordSegmenter._KANA.search(text): + return wordSegment.ChineseWordSegmentationStrategy(text, encoding) + else: + return wordSegment.UniscribeWordSegmentationStrategy(text, encoding) + + def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: + """Get the segment containing the given offset.""" + try: + return self.strategy.getSegmentForOffset(self.text, offset) + except Exception as e: + log.debugWarning("WordSegmenter.getSegmentForOffset failed: %s", e) + return None From 81f2040422b4c3f29b4e8f7980b2df18fde1358f Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Mon, 18 Aug 2025 11:55:36 +0800 Subject: [PATCH 16/93] update `textInfos/offsets.py` - replace `useUniscribe` with `useUniscribeForCharOffset` & `useWordSegmenterForWordOffset` for segmentation extensions - integrate `WordSegmenter` for calculating word offsets --- source/textInfos/offsets.py | 19 ++++++++++--------- 1 file changed, 10 
insertions(+), 9 deletions(-) diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index e67102c85ac..c7acfdc216a 100755 --- a/source/textInfos/offsets.py +++ b/source/textInfos/offsets.py @@ -1,8 +1,7 @@ -# textInfos/offsets.py # A part of NonVisual Desktop Access (NVDA) -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. -# Copyright (C) 2006-2024 NV Access Limited, Babbage B.V., Leonard de Ruijter +# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt from abc import abstractmethod import re @@ -156,8 +155,10 @@ class OffsetsTextInfo(textInfos.TextInfo): #: Honours documentFormatting config option if true - set to false if this is not at all slow. detectFormattingAfterCursorMaybeSlow: bool = True - #: Use uniscribe to calculate word offsets etc. - useUniscribe: bool = True + #: Use uniscribe to calculate character offsets. + useUniscribeForCharOffset: bool = True + #: Use word segmenter to calculate word offsets. + useWordSegmenterForWordOffset: bool = True #: The encoding internal to the underlying text info implementation. 
encoding: Optional[str] = textUtils.WCHAR_ENCODING @@ -382,7 +383,7 @@ def _getCharacterOffsets(self, offset): lineStart, lineEnd = self._getLineOffsets(offset) lineText = self._getTextRange(lineStart, lineEnd) relOffset = offset - lineStart - if self.useUniscribe: + if self.useUniscribeForCharOffset: offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_CHARACTER, relOffset) if offsets is not None: return (offsets[0] + lineStart, offsets[1] + lineStart) @@ -406,8 +407,8 @@ def _getWordOffsets(self, offset): # Convert NULL and non-breaking space to space to make sure that words will break on them lineText = lineText.translate({0: " ", 0xA0: " "}) relOffset = offset - lineStart - if self.useUniscribe: - offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_WORD, relOffset) + if self.useWordSegmenterForWordOffset: + offsets = textUtils.WordSegmenter(lineText, self.encoding).getSegmentForOffset(relOffset) if offsets is not None: return (offsets[0] + lineStart, offsets[1] + lineStart) # Fall back to the older word offsets detection that only breaks on non alphanumeric From da64cd88a6b8062fc7506f83d360b9072f4ff123 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Mon, 18 Aug 2025 12:02:49 +0800 Subject: [PATCH 17/93] update `displayModel.py` make `DisplayModelTextInfo`'s flag aligned with its superclass --- source/displayModel.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/source/displayModel.py b/source/displayModel.py index d81d878f03d..daaed917098 100644 --- a/source/displayModel.py +++ b/source/displayModel.py @@ -1,7 +1,7 @@ # A part of NonVisual Desktop Access (NVDA) -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. 
-# Copyright (C) 2006-2022 NV Access Limited, Babbage B.V., Joseph Lee, Cyrille Bougot +# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Joseph Lee, Cyrille Bougot, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt import ctypes from ctypes import * # noqa: F403 @@ -545,7 +545,8 @@ def _getStoryLength(self): return lineEndOffsets[-1] return 0 - useUniscribe = False + useUniscribeForCharOffset = False + useWordSegmenterForWordOffset = False def _getTextRange(self, start, end): return "".join(x for x in self._getFieldsInRange(start, end) if isinstance(x, str)) From 557f404368c331d8dfa5f2fbbe54de604118366c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Aug 2025 04:09:20 +0000 Subject: [PATCH 18/93] Pre-commit auto-fix --- source/textUtils/__init__.py | 5 +++-- source/textUtils/wordSegment.py | 21 ++++++++++++--------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 375fce953a8..9a06ae4fd4a 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -544,6 +544,7 @@ def getOffsetConverter(encoding: str) -> Type[OffsetConverter]: except IndexError as e: raise LookupError(f"Don't know how to deal with encoding '{encoding}'", e) + class WordSegmenter: """Selects appropriate segmentation strategy and segments text.""" @@ -571,5 +572,5 @@ def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: try: return self.strategy.getSegmentForOffset(self.text, offset) except Exception as e: - log.debugWarning("WordSegmenter.getSegmentForOffset failed: %s", e) - return None + log.debugWarning("WordSegmenter.getSegmentForOffset failed: %s", e) + return None diff --git 
a/source/textUtils/wordSegment.py b/source/textUtils/wordSegment.py index 94f82fb860d..e2dc3d3bd85 100644 --- a/source/textUtils/wordSegment.py +++ b/source/textUtils/wordSegment.py @@ -11,6 +11,7 @@ from logHandler import log import textUtils + class WordSegmentationStrategy(ABC): """Abstract base class for word segmentation strategies.""" @@ -19,7 +20,7 @@ def __init__(self, text: str, encoding: str | None = None): self.encoding = encoding @abstractmethod - def getSegmentForOffset(self, text: str, offset: int) -> tuple[int, int] | None: # TODO: optimize + def getSegmentForOffset(self, text: str, offset: int) -> tuple[int, int] | None: # TODO: optimize """Return (start inclusive, end exclusive) or None. Offsets are str offsets relative to self.text.""" pass @@ -32,11 +33,12 @@ def __init__(self, text: str, encoding: str | None = None): self._ensureLibLoaded() @classmethod - def _ensureLibLoaded(cls): # TODO: make cppjieba alternative + def _ensureLibLoaded(cls): # TODO: make cppjieba alternative if cls._lib is not None: return try: from NVDAHelper import versionedLibPath + lib_path = os.path.join(versionedLibPath, "cppjieba.dll") cls._lib = ctypes.cdll.LoadLibrary(lib_path) # Setup function signatures (adjust if your C API differs) @@ -86,7 +88,7 @@ def _callCppjiebaCached(text_utf8: bytes) -> list[int]: @lru_cache(maxsize=128) def _callCPPJieba(self, text: str) -> list[tuple[int, int]] | None: - data = text.encode('utf-8') + data = text.encode("utf-8") charPtr = POINTER(c_int)() outLen = c_int() result = self._lib.segmentOffsets(data, byref(charPtr), byref(outLen)) @@ -117,7 +119,7 @@ class UniscribeWordSegmentationStrategy(WordSegmentationStrategy): def _calculateUniscribeOffsets( self, lineText: str, - #unit: str, + # unit: str, relOffset: int, ) -> tuple[int, int] | None: """ @@ -130,12 +132,13 @@ def _calculateUniscribeOffsets( """ import NVDAHelper - #if unit is textInfos.UNIT_WORD: + + # if unit is textInfos.UNIT_WORD: helperFunc = 
NVDAHelper.localLib.calculateWordOffsets - #elif unit is textInfos.UNIT_CHARACTER: - #helperFunc = NVDAHelper.localLib.calculateCharacterOffsets - #else: - #raise NotImplementedError(f"Unit: {unit}") + # elif unit is textInfos.UNIT_CHARACTER: + # helperFunc = NVDAHelper.localLib.calculateCharacterOffsets + # else: + # raise NotImplementedError(f"Unit: {unit}") relStart = ctypes.c_int() relEnd = ctypes.c_int() # uniscribe does some strange things From f72d3488e8ec217ea267ab42559442653cbcabeb Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Mon, 18 Aug 2025 15:06:19 +0800 Subject: [PATCH 19/93] update type annotations --- source/textUtils/__init__.py | 8 ++++---- source/textUtils/wordSegment.py | 10 ++-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 9a06ae4fd4a..9cb91c90bce 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -550,17 +550,17 @@ class WordSegmenter: # Precompiled patterns # Chinese characters and Japanese kanjis (CJK Unified Ideographs) U+4E00 - U+9FFF - _HANZI = re.compile(r"[\u4E00-\u9FFF]") + _HANZI: re.Pattern = re.compile(r"[\u4E00-\u9FFF]") # Japanese kanas (Hiragana U+3040 - U+309F, Katakana U+30A0 - U+30FF) - _KANA = re.compile(r"[\u3040-\u309F\u30A0-\u30FF]") + _KANA: re.Pattern = re.compile(r"[\u3040-\u309F\u30A0-\u30FF]") - def __init__(self, text: str, encoding): + def __init__(self, text: str, encoding: str | None): self.text = text self.encoding = encoding self.strategy = self._choose_strategy(self.text, self.encoding) @staticmethod - def _choose_strategy(text: str, encoding) -> wordSegment.WordSegmentationStrategy: + def _choose_strategy(text: str, encoding: str | None) -> wordSegment.WordSegmentationStrategy: """Choose the appropriate segmentation strategy based on the text content.""" if WordSegmenter._HANZI.search(text) and not WordSegmenter._KANA.search(text): return 
wordSegment.ChineseWordSegmentationStrategy(text, encoding) diff --git a/source/textUtils/wordSegment.py b/source/textUtils/wordSegment.py index e2dc3d3bd85..3cee159d049 100644 --- a/source/textUtils/wordSegment.py +++ b/source/textUtils/wordSegment.py @@ -119,7 +119,6 @@ class UniscribeWordSegmentationStrategy(WordSegmentationStrategy): def _calculateUniscribeOffsets( self, lineText: str, - # unit: str, relOffset: int, ) -> tuple[int, int] | None: """ @@ -127,18 +126,13 @@ def _calculateUniscribeOffsets( using the Windows uniscribe library, also used in Notepad, for example. Units supported are character and word. @param lineText: the text string to analyze - @param unit: the TextInfo unit (character or word) @param relOffset: the character offset within the text string at which to calculate the bounds. """ import NVDAHelper - # if unit is textInfos.UNIT_WORD: helperFunc = NVDAHelper.localLib.calculateWordOffsets - # elif unit is textInfos.UNIT_CHARACTER: - # helperFunc = NVDAHelper.localLib.calculateCharacterOffsets - # else: - # raise NotImplementedError(f"Unit: {unit}") + relStart = ctypes.c_int() relEnd = ctypes.c_int() # uniscribe does some strange things @@ -165,7 +159,7 @@ def _calculateUniscribeOffsets( # We need to convert the uniscribe based offsets to str offsets. 
relStart, relEnd = offsetConverter.encodedToStrOffsets(relStart, relEnd) return (relStart, relEnd) - log.debugWarning(f"Uniscribe failed to calculate {unit} offsets for text {lineText!r}") + log.debugWarning(f"Uniscribe failed to calculate word offsets for text {lineText!r}") return None def getSegmentForOffset(self, text: str, offset: int) -> tuple[int, int] | None: From adc22fb9ab7bbba0abd79a0f46c7c09d809faf76 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Wed, 20 Aug 2025 19:22:28 +0800 Subject: [PATCH 20/93] add wrapper for word manager --- nvdaHelper/cppjieba/cppjieba.cpp | 12 ++++++++++++ nvdaHelper/cppjieba/cppjieba.def | 3 +++ nvdaHelper/cppjieba/cppjieba.hpp | 5 +++++ nvdaHelper/cppjieba/sconscript | 5 +++++ 4 files changed, 25 insertions(+) diff --git a/nvdaHelper/cppjieba/cppjieba.cpp b/nvdaHelper/cppjieba/cppjieba.cpp index ccd972385e9..503087483be 100644 --- a/nvdaHelper/cppjieba/cppjieba.cpp +++ b/nvdaHelper/cppjieba/cppjieba.cpp @@ -71,6 +71,18 @@ int segmentOffsets(const char* text, int** charOffsets, int* outLen) { return 0; } +bool insertUserWord(const string& word, int freq, const string& tag = cppjieba::UNKNOWN_TAG) { + return JiebaSingleton::getInstance().InsertUserWord(word, freq, tag); +} + +bool find(const string& word) { + return JiebaSingleton::getInstance().Find(word); +} + +bool deleteUserWord(const string& word, const string& tag = cppjieba::UNKNOWN_TAG) { + return JiebaSingleton::getInstance().DeleteUserWord(word, tag); +} + void freeOffsets(int* ptr) { if (ptr) free(ptr); } diff --git a/nvdaHelper/cppjieba/cppjieba.def b/nvdaHelper/cppjieba/cppjieba.def index cc2246d0f21..4a7c6b90744 100644 --- a/nvdaHelper/cppjieba/cppjieba.def +++ b/nvdaHelper/cppjieba/cppjieba.def @@ -2,4 +2,7 @@ LIBRARY cppjieba EXPORTS initJieba segmentOffsets + insertUserWord + find + deleteUserWord freeOffsets diff --git a/nvdaHelper/cppjieba/cppjieba.hpp b/nvdaHelper/cppjieba/cppjieba.hpp index c5f0bad4af7..6d1adbc3e4e 100644 --- 
a/nvdaHelper/cppjieba/cppjieba.hpp +++ b/nvdaHelper/cppjieba/cppjieba.hpp @@ -56,6 +56,11 @@ JIEBA_API int initJieba(); /// @return 0 on success, -1 on failure. JIEBA_API int segmentOffsets(const char* text, int** charOffsets, int* outLen); +/// Wrapper for word management +JIEBA_API bool insertUserWord(const string& word, int freq, const string& tag); +JIEBA_API bool find(const string& word); +JIEBA_API bool deleteUserWord(const string& word, const string& tag); + /// @brief Free memory allocated by segmentOffsets. JIEBA_API void freeOffsets(int* ptr); diff --git a/nvdaHelper/cppjieba/sconscript b/nvdaHelper/cppjieba/sconscript index 1a1fd12a2c2..9f1d63452da 100644 --- a/nvdaHelper/cppjieba/sconscript +++ b/nvdaHelper/cppjieba/sconscript @@ -35,6 +35,11 @@ sourceFiles = [ "cppjieba.def", ] +env.AppendUnique( + CCFLAGS=['/wd4819'], + CXXFLAGS=['/wd4819'], +) + cppjiebaLib = env.SharedLibrary(target="cppjieba", source=sourceFiles) if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # ensure dicts installation happens only once and avoid a scons' warning From 4adac07353717779833fadada86b14ea50edc7b5 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Wed, 20 Aug 2025 21:22:52 +0800 Subject: [PATCH 21/93] update the word segmentation structure - redesign 2 properties of 'OffsetTextInfo' as enums to make them more configurable, inspired by @LeonarddeR - override them in some subclasses to simulate specific behaviors --- source/NVDAObjects/window/edit.py | 6 ++++++ source/displayModel.py | 6 ++++-- source/textInfos/offsets.py | 14 +++++++------- source/textUtils/__init__.py | 28 ++++++++++++++++++---------- source/textUtils/segFlag.py | 19 +++++++++++++++++++ 5 files changed, 54 insertions(+), 19 deletions(-) create mode 100644 source/textUtils/segFlag.py diff --git a/source/NVDAObjects/window/edit.py b/source/NVDAObjects/window/edit.py index bb1018f7c12..b842154066d 100644 --- 
a/source/NVDAObjects/window/edit.py +++ b/source/NVDAObjects/window/edit.py @@ -34,6 +34,7 @@ import watchdog import locationHelper import textUtils +from textUtils.segFlag import CharSegFlag, WordSegFlag selOffsetsAtLastCaretEvent = None @@ -169,6 +170,11 @@ class getTextLengthExStruct(ctypes.Structure): class EditTextInfo(textInfos.offsets.OffsetsTextInfo): + + # Override segFlags to enforce use of Uniscribe + charSegFlag = CharSegFlag.UNISCRIBE + wordSegFlag = WordSegFlag.UNISCRIBE + def _getPointFromOffset(self, offset): if self.obj.editAPIVersion == 1 or self.obj.editAPIVersion >= 3: processHandle = self.obj.processHandle diff --git a/source/displayModel.py b/source/displayModel.py index daaed917098..ee98d1e7094 100644 --- a/source/displayModel.py +++ b/source/displayModel.py @@ -23,6 +23,7 @@ import windowUtils from locationHelper import RectLTRB, RectLTWH import textUtils +from textUtils.segFlag import CharSegFlag, WordSegFlag from typing import ( List, Tuple, @@ -545,8 +546,9 @@ def _getStoryLength(self): return lineEndOffsets[-1] return 0 - useUniscribeForCharOffset = False - useWordSegmenterForWordOffset = False + # Override segFlags to strictly use the old fallen-back method + charSegFlag = CharSegFlag.NONE + wordSegFlag = WordSegFlag.NONE def _getTextRange(self, start, end): return "".join(x for x in self._getFieldsInRange(start, end) if isinstance(x, str)) diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index c7acfdc216a..8284797de71 100755 --- a/source/textInfos/offsets.py +++ b/source/textInfos/offsets.py @@ -13,6 +13,7 @@ import locationHelper from treeInterceptorHandler import TreeInterceptor import textUtils +from textUtils.segFlag import CharSegFlag, WordSegFlag from dataclasses import dataclass from typing import ( Optional, @@ -155,10 +156,9 @@ class OffsetsTextInfo(textInfos.TextInfo): #: Honours documentFormatting config option if true - set to false if this is not at all slow. 
detectFormattingAfterCursorMaybeSlow: bool = True - #: Use uniscribe to calculate character offsets. - useUniscribeForCharOffset: bool = True - #: Use word segmenter to calculate word offsets. - useWordSegmenterForWordOffset: bool = True + #: Method to calculate character and word offsets. + charSegFlag: CharSegFlag = CharSegFlag.UNISCRIBE + wordSegFlag: WordSegFlag = WordSegFlag.ON_SEGMENTER #: The encoding internal to the underlying text info implementation. encoding: Optional[str] = textUtils.WCHAR_ENCODING @@ -383,7 +383,7 @@ def _getCharacterOffsets(self, offset): lineStart, lineEnd = self._getLineOffsets(offset) lineText = self._getTextRange(lineStart, lineEnd) relOffset = offset - lineStart - if self.useUniscribeForCharOffset: + if self.charSegFlag == CharSegFlag.UNISCRIBE: offsets = self._calculateUniscribeOffsets(lineText, textInfos.UNIT_CHARACTER, relOffset) if offsets is not None: return (offsets[0] + lineStart, offsets[1] + lineStart) @@ -407,8 +407,8 @@ def _getWordOffsets(self, offset): # Convert NULL and non-breaking space to space to make sure that words will break on them lineText = lineText.translate({0: " ", 0xA0: " "}) relOffset = offset - lineStart - if self.useWordSegmenterForWordOffset: - offsets = textUtils.WordSegmenter(lineText, self.encoding).getSegmentForOffset(relOffset) + if self.wordSegFlag: + offsets = textUtils.WordSegmenter(lineText, self.encoding, self.wordSegFlag).getSegmentForOffset(relOffset) if offsets is not None: return (offsets[0] + lineStart, offsets[1] + lineStart) # Fall back to the older word offsets detection that only breaks on non alphanumeric diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 9cb91c90bce..13bdc58575b 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -21,6 +21,7 @@ from .uniscribe import splitAtCharacterBoundaries from . 
import wordSegment +from .segFlag import WordSegFlag WCHAR_ENCODING = "utf_16_le" UTF8_ENCODING = "utf-8" @@ -549,23 +550,30 @@ class WordSegmenter: """Selects appropriate segmentation strategy and segments text.""" # Precompiled patterns - # Chinese characters and Japanese kanjis (CJK Unified Ideographs) U+4E00 - U+9FFF - _HANZI: re.Pattern = re.compile(r"[\u4E00-\u9FFF]") - # Japanese kanas (Hiragana U+3040 - U+309F, Katakana U+30A0 - U+30FF) + # Chinese characters and Japanese kanji (CJK Unified Ideographs U+4E00 - U+9FFF) + _CHINESE_CHARACTER_AND_JAPANESE_KANJI: re.Pattern = re.compile(r"[\u4E00-\u9FFF]") + # Japanese kana (Hiragana U+3040 - U+309F, Katakana U+30A0 - U+30FF) _KANA: re.Pattern = re.compile(r"[\u3040-\u309F\u30A0-\u30FF]") - def __init__(self, text: str, encoding: str | None): + def __init__(self, text: str, encoding: str | None, wordSegFlag: WordSegFlag): self.text = text self.encoding = encoding - self.strategy = self._choose_strategy(self.text, self.encoding) + self.wordSegFlag = wordSegFlag + self.strategy = self._choose_strategy() - @staticmethod - def _choose_strategy(text: str, encoding: str | None) -> wordSegment.WordSegmentationStrategy: + def _choose_strategy(self) -> wordSegment.WordSegmentationStrategy: # TODO: optimize """Choose the appropriate segmentation strategy based on the text content.""" - if WordSegmenter._HANZI.search(text) and not WordSegmenter._KANA.search(text): - return wordSegment.ChineseWordSegmentationStrategy(text, encoding) + if self.wordSegFlag == WordSegFlag.ON_SEGMENTER: + if WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search(self.text) and not WordSegmenter._KANA.search(self.text): + return wordSegment.ChineseWordSegmentationStrategy(self.text, self.encoding) + else: + return wordSegment.UniscribeWordSegmentationStrategy(self.text, self.encoding) else: - return wordSegment.UniscribeWordSegmentationStrategy(text, encoding) + match self.wordSegFlag: + case WordSegFlag.UNISCRIBE: + return 
wordSegment.UniscribeWordSegmentationStrategy(self.text, self.encoding) + case WordSegFlag.CHINESE: + return wordSegment.ChineseWordSegmentationStrategy(self.text, self.encoding) def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: """Get the segment containing the given offset.""" diff --git a/source/textUtils/segFlag.py b/source/textUtils/segFlag.py new file mode 100644 index 00000000000..84e3482e28c --- /dev/null +++ b/source/textUtils/segFlag.py @@ -0,0 +1,19 @@ +from enum import IntFlag + +# shared bit masks (explicit powers of two) +_ON_SEGMENTER = 1 << 0 +_UNISCRIBE = 1 << 1 +_CHINESE = 1 << 2 + +class CharSegFlag(IntFlag): + """Character-level segmentation flags.""" + NONE = 0 + ON_SEGMENTER = _ON_SEGMENTER + UNISCRIBE = _UNISCRIBE + +class WordSegFlag(IntFlag): + """Word-level segmentation flags.""" + NONE = 0 + ON_SEGMENTER = _ON_SEGMENTER + UNISCRIBE = _UNISCRIBE + CHINESE = _CHINESE From 0d40f0a1fe4349cc6dca5d5fa3c607765232e69b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 20 Aug 2025 13:31:57 +0000 Subject: [PATCH 22/93] Pre-commit auto-fix --- source/NVDAObjects/window/edit.py | 1 - source/textInfos/offsets.py | 4 +++- source/textUtils/__init__.py | 4 +++- source/textUtils/segFlag.py | 22 +++++++++++++--------- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/source/NVDAObjects/window/edit.py b/source/NVDAObjects/window/edit.py index b842154066d..718d48d45be 100644 --- a/source/NVDAObjects/window/edit.py +++ b/source/NVDAObjects/window/edit.py @@ -170,7 +170,6 @@ class getTextLengthExStruct(ctypes.Structure): class EditTextInfo(textInfos.offsets.OffsetsTextInfo): - # Override segFlags to enforce use of Uniscribe charSegFlag = CharSegFlag.UNISCRIBE wordSegFlag = WordSegFlag.UNISCRIBE diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index 09ca96268a3..257fec0ad6e 100755 --- a/source/textInfos/offsets.py +++ 
b/source/textInfos/offsets.py @@ -403,7 +403,9 @@ def _getWordOffsets(self, offset): lineText = lineText.translate({0: " ", 0xA0: " "}) relOffset = offset - lineStart if self.wordSegFlag: - offsets = textUtils.WordSegmenter(lineText, self.encoding, self.wordSegFlag).getSegmentForOffset(relOffset) + offsets = textUtils.WordSegmenter(lineText, self.encoding, self.wordSegFlag).getSegmentForOffset( + relOffset + ) if offsets is not None: return (offsets[0] + lineStart, offsets[1] + lineStart) # Fall back to the older word offsets detection that only breaks on non alphanumeric diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 13bdc58575b..f037c06cf17 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -564,7 +564,9 @@ def __init__(self, text: str, encoding: str | None, wordSegFlag: WordSegFlag): def _choose_strategy(self) -> wordSegment.WordSegmentationStrategy: # TODO: optimize """Choose the appropriate segmentation strategy based on the text content.""" if self.wordSegFlag == WordSegFlag.ON_SEGMENTER: - if WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search(self.text) and not WordSegmenter._KANA.search(self.text): + if WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search( + self.text + ) and not WordSegmenter._KANA.search(self.text): return wordSegment.ChineseWordSegmentationStrategy(self.text, self.encoding) else: return wordSegment.UniscribeWordSegmentationStrategy(self.text, self.encoding) diff --git a/source/textUtils/segFlag.py b/source/textUtils/segFlag.py index 84e3482e28c..1a62f1aae21 100644 --- a/source/textUtils/segFlag.py +++ b/source/textUtils/segFlag.py @@ -5,15 +5,19 @@ _UNISCRIBE = 1 << 1 _CHINESE = 1 << 2 + class CharSegFlag(IntFlag): - """Character-level segmentation flags.""" - NONE = 0 - ON_SEGMENTER = _ON_SEGMENTER - UNISCRIBE = _UNISCRIBE + """Character-level segmentation flags.""" + + NONE = 0 + ON_SEGMENTER = _ON_SEGMENTER + UNISCRIBE = _UNISCRIBE + class WordSegFlag(IntFlag): 
- """Word-level segmentation flags.""" - NONE = 0 - ON_SEGMENTER = _ON_SEGMENTER - UNISCRIBE = _UNISCRIBE - CHINESE = _CHINESE + """Word-level segmentation flags.""" + + NONE = 0 + ON_SEGMENTER = _ON_SEGMENTER + UNISCRIBE = _UNISCRIBE + CHINESE = _CHINESE From 676fc42fdd543026c9430e97bfc359e9f321c0d4 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Thu, 21 Aug 2025 09:51:24 +0800 Subject: [PATCH 23/93] add copyright header --- source/textUtils/segFlag.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/source/textUtils/segFlag.py b/source/textUtils/segFlag.py index 1a62f1aae21..435466a7a1c 100644 --- a/source/textUtils/segFlag.py +++ b/source/textUtils/segFlag.py @@ -1,3 +1,8 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + from enum import IntFlag # shared bit masks (explicit powers of two) From ddd48e86dc818a7a1853c20f46271c270b9bc629 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Thu, 21 Aug 2025 10:33:28 +0800 Subject: [PATCH 24/93] add type annotations --- source/textUtils/__init__.py | 8 ++++---- source/textUtils/segFlag.py | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index f037c06cf17..22eb63ee096 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -556,10 +556,10 @@ class WordSegmenter: _KANA: re.Pattern = re.compile(r"[\u3040-\u309F\u30A0-\u30FF]") def __init__(self, text: str, encoding: str | None, wordSegFlag: WordSegFlag): - self.text = text - self.encoding = encoding - self.wordSegFlag = wordSegFlag - self.strategy = self._choose_strategy() + self.text: str = text + 
self.encoding: str | None = encoding + self.wordSegFlag: WordSegFlag = wordSegFlag + self.strategy: wordSegment.WordSegmentationStrategy = self._choose_strategy() def _choose_strategy(self) -> wordSegment.WordSegmentationStrategy: # TODO: optimize """Choose the appropriate segmentation strategy based on the text content.""" diff --git a/source/textUtils/segFlag.py b/source/textUtils/segFlag.py index 435466a7a1c..48f1b58dd01 100644 --- a/source/textUtils/segFlag.py +++ b/source/textUtils/segFlag.py @@ -6,23 +6,23 @@ from enum import IntFlag # shared bit masks (explicit powers of two) -_ON_SEGMENTER = 1 << 0 -_UNISCRIBE = 1 << 1 -_CHINESE = 1 << 2 +_ON_SEGMENTER: int = 1 << 0 +_UNISCRIBE: int = 1 << 1 +_CHINESE: int = 1 << 2 class CharSegFlag(IntFlag): """Character-level segmentation flags.""" - NONE = 0 - ON_SEGMENTER = _ON_SEGMENTER - UNISCRIBE = _UNISCRIBE + NONE: int = 0 + ON_SEGMENTER: int = _ON_SEGMENTER + UNISCRIBE: int = _UNISCRIBE class WordSegFlag(IntFlag): """Word-level segmentation flags.""" - NONE = 0 - ON_SEGMENTER = _ON_SEGMENTER - UNISCRIBE = _UNISCRIBE - CHINESE = _CHINESE + NONE: int = 0 + ON_SEGMENTER: int = _ON_SEGMENTER + UNISCRIBE: int = _UNISCRIBE + CHINESE: int = _CHINESE From 3c65868cbb159d05a8a93ab59b714824d1c3d250 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Thu, 21 Aug 2025 10:36:16 +0800 Subject: [PATCH 25/93] update log --- source/textUtils/__init__.py | 2 +- source/textUtils/wordSegment.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 22eb63ee096..420f3245784 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -582,5 +582,5 @@ def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: try: return self.strategy.getSegmentForOffset(self.text, offset) except Exception as e: - log.debugWarning("WordSegmenter.getSegmentForOffset failed: %s", e) + 
log.debugWarning("WordSegmenter.getSegmentForOffset failed: %s text: '%s' offset: %s segmentation strategy: %s", e, self.text, offset, self.strategy) return None diff --git a/source/textUtils/wordSegment.py b/source/textUtils/wordSegment.py index 3cee159d049..07bff52af04 100644 --- a/source/textUtils/wordSegment.py +++ b/source/textUtils/wordSegment.py @@ -159,7 +159,6 @@ def _calculateUniscribeOffsets( # We need to convert the uniscribe based offsets to str offsets. relStart, relEnd = offsetConverter.encodedToStrOffsets(relStart, relEnd) return (relStart, relEnd) - log.debugWarning(f"Uniscribe failed to calculate word offsets for text {lineText!r}") return None def getSegmentForOffset(self, text: str, offset: int) -> tuple[int, int] | None: From d69e8b7cc9515a6ac5374ad1f4250b952a3a1b21 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Thu, 21 Aug 2025 10:44:37 +0800 Subject: [PATCH 26/93] add trailing commas in multi-line constructs --- source/textInfos/offsets.py | 2 +- source/textUtils/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index 257fec0ad6e..a5c175260bd 100755 --- a/source/textInfos/offsets.py +++ b/source/textInfos/offsets.py @@ -404,7 +404,7 @@ def _getWordOffsets(self, offset): relOffset = offset - lineStart if self.wordSegFlag: offsets = textUtils.WordSegmenter(lineText, self.encoding, self.wordSegFlag).getSegmentForOffset( - relOffset + relOffset, ) if offsets is not None: return (offsets[0] + lineStart, offsets[1] + lineStart) diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 420f3245784..225aa4fb6d1 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -565,7 +565,7 @@ def _choose_strategy(self) -> wordSegment.WordSegmentationStrategy: # TODO: opt """Choose the appropriate segmentation strategy based on the text content.""" if self.wordSegFlag == WordSegFlag.ON_SEGMENTER: if 
WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search( - self.text + self.text, ) and not WordSegmenter._KANA.search(self.text): return wordSegment.ChineseWordSegmentationStrategy(self.text, self.encoding) else: From 8244a76e0e795b2dc0140d089f4d147200576216 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Thu, 21 Aug 2025 11:04:44 +0800 Subject: [PATCH 27/93] make wordSegment module to make file structure clearer - create `wordSeg` package - migrate wordSegment module into wordSeg package and rename to wordSeg --- source/textUtils/__init__.py | 14 +++++++------- source/textUtils/wordSeg/__init__.py | 0 .../{wordSegment.py => wordSeg/wordSegStrategy.py} | 0 3 files changed, 7 insertions(+), 7 deletions(-) create mode 100644 source/textUtils/wordSeg/__init__.py rename source/textUtils/{wordSegment.py => wordSeg/wordSegStrategy.py} (100%) diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 225aa4fb6d1..1631bc07fda 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -20,7 +20,7 @@ from logHandler import log from .uniscribe import splitAtCharacterBoundaries -from . 
import wordSegment +from .wordSeg import wordSegStrategy from .segFlag import WordSegFlag WCHAR_ENCODING = "utf_16_le" @@ -559,23 +559,23 @@ def __init__(self, text: str, encoding: str | None, wordSegFlag: WordSegFlag): self.text: str = text self.encoding: str | None = encoding self.wordSegFlag: WordSegFlag = wordSegFlag - self.strategy: wordSegment.WordSegmentationStrategy = self._choose_strategy() + self.strategy: wordSegStrategy.WordSegmentationStrategy = self._choose_strategy() - def _choose_strategy(self) -> wordSegment.WordSegmentationStrategy: # TODO: optimize + def _choose_strategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: optimize """Choose the appropriate segmentation strategy based on the text content.""" if self.wordSegFlag == WordSegFlag.ON_SEGMENTER: if WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search( self.text, ) and not WordSegmenter._KANA.search(self.text): - return wordSegment.ChineseWordSegmentationStrategy(self.text, self.encoding) + return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) else: - return wordSegment.UniscribeWordSegmentationStrategy(self.text, self.encoding) + return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) else: match self.wordSegFlag: case WordSegFlag.UNISCRIBE: - return wordSegment.UniscribeWordSegmentationStrategy(self.text, self.encoding) + return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) case WordSegFlag.CHINESE: - return wordSegment.ChineseWordSegmentationStrategy(self.text, self.encoding) + return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: """Get the segment containing the given offset.""" diff --git a/source/textUtils/wordSeg/__init__.py b/source/textUtils/wordSeg/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/source/textUtils/wordSegment.py 
b/source/textUtils/wordSeg/wordSegStrategy.py similarity index 100% rename from source/textUtils/wordSegment.py rename to source/textUtils/wordSeg/wordSegStrategy.py From 3f54d62b02c46e1741148c75da44a477ab7e8fab Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Thu, 21 Aug 2025 14:59:22 +0800 Subject: [PATCH 28/93] add initialization logic to wordSeg module - add an decorator to easily add initializers - extract `cppjieba`'s initializer to fit the decorator --- source/core.py | 12 +++++- source/textUtils/wordSeg/__init__.py | 44 +++++++++++++++++++++ source/textUtils/wordSeg/wordSegStrategy.py | 44 ++++++++++++++++++--- 3 files changed, 94 insertions(+), 6 deletions(-) diff --git a/source/core.py b/source/core.py index 25a34bb9be2..9aab7a3fa5a 100644 --- a/source/core.py +++ b/source/core.py @@ -1,6 +1,6 @@ # A part of NonVisual Desktop Access (NVDA) # Copyright (C) 2006-2025 NV Access Limited, Aleksey Sadovoy, Christopher Toth, Joseph Lee, Peter Vágner, -# Derek Riemer, Babbage B.V., Zahari Yurukov, Łukasz Golonka, Cyrille Bougot, Julien Cochuyt +# Derek Riemer, Babbage B.V., Zahari Yurukov, Łukasz Golonka, Cyrille Bougot, Julien Cochuyt, Wang Chong # This file is covered by the GNU General Public License. # See the file COPYING for more details. 
@@ -906,6 +906,16 @@ def main(): _remoteClient.initialize() + from textUtils import wordSeg + log.debug("Initializing word segmentation module") + + try: + wordSeg.initialize() + except RuntimeError: + log.warning("Word segmentation module disabled in configuration") + except: # noqa: E722 + log.error("Error initializing word segmentation module", exc_info=True) + if globalVars.appArgs.install or globalVars.appArgs.installSilent: import gui.installerGui diff --git a/source/textUtils/wordSeg/__init__.py b/source/textUtils/wordSeg/__init__.py index e69de29bb2d..496e6f3b054 100644 --- a/source/textUtils/wordSeg/__init__.py +++ b/source/textUtils/wordSeg/__init__.py @@ -0,0 +1,44 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +import importlib +from logHandler import log + +def initialize(): + """ + Call all registered initializer functions recorded in wordSegStrategy.initializerList. + + Each entry is a tuple: (module_name, qualname, func_obj, args, kwargs). + We try to resolve the callable from the module and qualname at runtime + (this handles classmethod/staticmethod wrapping order). If resolution fails, + we fall back to the stored func_obj. + + Exceptions from individual initializers are caught and logged so that one + failing initializer doesn't stop the rest. + """ + + from . 
import wordSegStrategy + + for module_name, qualname, func_obj, args, kwargs in getattr(wordSegStrategy, "initializerList", []): + callable_to_call = None + # try to resolve module + qualname to a current attribute (handles classmethod/staticmethod) + try: + mod = importlib.import_module(module_name) + obj = mod + for part in qualname.split("."): + obj = getattr(obj, part) + callable_to_call = obj + except Exception: + # fallback to original function object captured during decoration + callable_to_call = func_obj + + # Final call with its args/kwargs and exception handling + try: + if not callable(callable_to_call): + raise TypeError(f"Resolved initializer is not callable: {module_name}.{qualname}") + callable_to_call(*args, **kwargs) + except Exception as e: + log.debug("Initializer %s.%s failed: %s", module_name, qualname, e) + return diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index 07bff52af04..0ef0838d64f 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -8,9 +8,40 @@ from ctypes import c_char_p, c_int, POINTER, byref from abc import ABC, abstractmethod from functools import lru_cache +from collections.abc import Callable +from typing import Any + from logHandler import log import textUtils +# Initializer registry (robust: saves module + qualname + original function + args/kwargs) +# Each entry: (module_name: str, qualname: str, func_obj: Callable, args: tuple, kwargs: dict) +initializerList: list[tuple[str, str, Callable[..., Any], tuple[Any, ...], dict[str, Any]]] = [] + +def initializerRegistry(*decorator_args, **decorator_kwargs): + """ + A decorator to register an initializer function. + Usage: + @initializerRegistry + def f(): ... + or with arguments: + @initializerRegistry(arg1, arg2, kw=val) + def f(...): ... 
+ We save (func.__module__, func.__qualname__, func, args, kwargs) so that during + package initialize() we can dynamically resolve the callable from the module + (this handles classmethod/staticmethod ordering issues). + """ + if decorator_args and callable(decorator_args[0]) and len(decorator_args) == 1 and not decorator_kwargs: + func = decorator_args[0] + initializerList.append((func.__module__, func.__qualname__, func, (), {})) + return func + + def _decorator(func: Callable[..., Any]): + initializerList.append((func.__module__, func.__qualname__, func, decorator_args, decorator_kwargs)) + return func + + return _decorator + class WordSegmentationStrategy(ABC): """Abstract base class for word segmentation strategies.""" @@ -28,12 +59,13 @@ def getSegmentForOffset(self, text: str, offset: int) -> tuple[int, int] | None: class ChineseWordSegmentationStrategy(WordSegmentationStrategy): _lib = None - def __init__(self, text: str, encoding: str | None = None): - super().__init__(text, encoding) - self._ensureLibLoaded() - @classmethod + @initializerRegistry def _ensureLibLoaded(cls): # TODO: make cppjieba alternative + """ + Class-level initializer: attempts to load the versioned cppjieba library and + set up ctypes signatures. 
+ """ if cls._lib is not None: return try: @@ -41,7 +73,9 @@ def _ensureLibLoaded(cls): # TODO: make cppjieba alternative lib_path = os.path.join(versionedLibPath, "cppjieba.dll") cls._lib = ctypes.cdll.LoadLibrary(lib_path) - # Setup function signatures (adjust if your C API differs) + + # Setup function signatures + # int initJieba() cls._lib.initJieba.restype = c_int cls._lib.initJieba.argtypes = [] From 38b4bea46858185061c6009730a5333760e453f3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 21 Aug 2025 07:00:36 +0000 Subject: [PATCH 29/93] Pre-commit auto-fix --- source/core.py | 1 + source/textUtils/__init__.py | 8 +++++++- source/textUtils/wordSeg/__init__.py | 1 + source/textUtils/wordSeg/wordSegStrategy.py | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/source/core.py b/source/core.py index 9aab7a3fa5a..fd0b5ebadc3 100644 --- a/source/core.py +++ b/source/core.py @@ -907,6 +907,7 @@ def main(): _remoteClient.initialize() from textUtils import wordSeg + log.debug("Initializing word segmentation module") try: diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 1631bc07fda..d6a3abe181e 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -582,5 +582,11 @@ def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: try: return self.strategy.getSegmentForOffset(self.text, offset) except Exception as e: - log.debugWarning("WordSegmenter.getSegmentForOffset failed: %s text: '%s' offset: %s segmentation strategy: %s", e, self.text, offset, self.strategy) + log.debugWarning( + "WordSegmenter.getSegmentForOffset failed: %s text: '%s' offset: %s segmentation strategy: %s", + e, + self.text, + offset, + self.strategy, + ) return None diff --git a/source/textUtils/wordSeg/__init__.py b/source/textUtils/wordSeg/__init__.py index 496e6f3b054..7fd3b8e23db 100644 --- a/source/textUtils/wordSeg/__init__.py +++ 
b/source/textUtils/wordSeg/__init__.py @@ -6,6 +6,7 @@ import importlib from logHandler import log + def initialize(): """ Call all registered initializer functions recorded in wordSegStrategy.initializerList. diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index 0ef0838d64f..84750b5b799 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -18,6 +18,7 @@ # Each entry: (module_name: str, qualname: str, func_obj: Callable, args: tuple, kwargs: dict) initializerList: list[tuple[str, str, Callable[..., Any], tuple[Any, ...], dict[str, Any]]] = [] + def initializerRegistry(*decorator_args, **decorator_kwargs): """ A decorator to register an initializer function. From eeb96aa73eafae79372e048b7b6df789d6336380 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sat, 23 Aug 2025 20:33:54 +0800 Subject: [PATCH 30/93] use multithreading for cppjieba's initialization --- source/textUtils/wordSeg/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/textUtils/wordSeg/__init__.py b/source/textUtils/wordSeg/__init__.py index 7fd3b8e23db..77231b58fa3 100644 --- a/source/textUtils/wordSeg/__init__.py +++ b/source/textUtils/wordSeg/__init__.py @@ -21,8 +21,9 @@ def initialize(): """ from . 
import wordSegStrategy + from threading import Thread - for module_name, qualname, func_obj, args, kwargs in getattr(wordSegStrategy, "initializerList", []): + for module_name, qualname, func_obj, args, kwargs in wordSegStrategy.initializerList: callable_to_call = None # try to resolve module + qualname to a current attribute (handles classmethod/staticmethod) try: @@ -39,7 +40,7 @@ def initialize(): try: if not callable(callable_to_call): raise TypeError(f"Resolved initializer is not callable: {module_name}.{qualname}") - callable_to_call(*args, **kwargs) + Thread(target=callable_to_call, args=args, kwargs=kwargs, daemon=True).start() except Exception as e: log.debug("Initializer %s.%s failed: %s", module_name, qualname, e) return From 3ba56f0a517ec66a26fb048cd568767c5dcff042 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sat, 23 Aug 2025 23:25:47 +0800 Subject: [PATCH 31/93] add configuration for word navigation users can find it in NVDA settings -> Document Navigation -> Word Segmentation Method --- source/NVDAObjects/window/edit.py | 10 ++++++---- source/config/configSpec.py | 8 +++++--- source/config/featureFlagEnums.py | 26 +++++++++++++++++++++++--- source/displayModel.py | 4 +++- source/gui/settingsDialogs.py | 17 +++++++++++++++-- source/textInfos/offsets.py | 23 ++++++++++++++++++++++- source/textUtils/__init__.py | 2 +- source/textUtils/segFlag.py | 6 +++--- 8 files changed, 78 insertions(+), 18 deletions(-) diff --git a/source/NVDAObjects/window/edit.py b/source/NVDAObjects/window/edit.py index 718d48d45be..a1a3a5820a3 100644 --- a/source/NVDAObjects/window/edit.py +++ b/source/NVDAObjects/window/edit.py @@ -1,7 +1,7 @@ # A part of NonVisual Desktop Access (NVDA) -# Copyright (C) 2006-2023 NV Access Limited, Babbage B.V., Cyrille Bougot, Leonard de Ruijter -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. 
+# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Cyrille Bougot, Leonard de Ruijter, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt from typing import ( Dict, @@ -172,7 +172,9 @@ class getTextLengthExStruct(ctypes.Structure): class EditTextInfo(textInfos.offsets.OffsetsTextInfo): # Override segFlags to enforce use of Uniscribe charSegFlag = CharSegFlag.UNISCRIBE - wordSegFlag = WordSegFlag.UNISCRIBE + @property + def wordSegFlag(self): + return WordSegFlag.UNISCRIBE def _getPointFromOffset(self, offset): if self.obj.editAPIVersion == 1 or self.obj.editAPIVersion >= 3: diff --git a/source/config/configSpec.py b/source/config/configSpec.py index 69864c00ad2..b15c9ff7c45 100644 --- a/source/config/configSpec.py +++ b/source/config/configSpec.py @@ -1,9 +1,10 @@ # A part of NonVisual Desktop Access (NVDA) # Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Davy Kager, Bill Dengler, Julien Cochuyt, # Joseph Lee, Dawid Pieper, mltony, Bram Duvigneau, Cyrille Bougot, Rob Meredith, -# Burman's Computer and Education Ltd., Leonard de Ruijter, Łukasz Golonka, Cary-rowen -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. +# Burman's Computer and Education Ltd., Leonard de Ruijter, Łukasz Golonka, Cary-rowen, +# Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt from io import StringIO from configobj import ConfigObj @@ -257,6 +258,7 @@ reportClickable = boolean(default=true) [documentNavigation] + wordSegmentationStandard = featureFlag(optionsEnum="WordNavigationUnitFlag", behaviorOfDefault="Uniscribe") paragraphStyle = featureFlag(optionsEnum="ParagraphNavigationFlag", behaviorOfDefault="application") [reviewCursor] diff --git a/source/config/featureFlagEnums.py b/source/config/featureFlagEnums.py index 5bcb1db1fdb..5b6b1914a21 100644 --- a/source/config/featureFlagEnums.py +++ b/source/config/featureFlagEnums.py @@ -1,7 +1,7 @@ # A part of NonVisual Desktop Access (NVDA) -# Copyright (C) 2022 NV Access Limited, Bill Dengler, Rob Meredith -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. +# Copyright (C) 2022-2025 NV Access Limited, Bill Dengler, Rob Meredith, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt """ Feature flag value enumerations. @@ -139,6 +139,26 @@ def _displayStringLabels(self) -> dict["FontFormattingBrailleModeFlag", str]: } +class WordNavigationUnitFlag(DisplayStringEnum): + """Enumeration for word navigation.""" + + @property + def _displayStringLabels(self): + return { + # Translators: Label for a method of word segmentation. + self.AUTO: _("Auto"), + # Translators: Label for a method of word segmentation. + self.UNISCRIBE: _("Uniscribe"), + # Translators: Label for a method of word segmentation. 
+ self.CHINESE: _("Chinese"), + } + + DEFAULT = enum.auto() + AUTO = enum.auto() + UNISCRIBE = enum.auto() + CHINESE = enum.auto() + + def getAvailableEnums() -> typing.Generator[typing.Tuple[str, FlagValueEnum], None, None]: for name, value in globals().items(): if ( diff --git a/source/displayModel.py b/source/displayModel.py index ee98d1e7094..5e1674ccd3a 100644 --- a/source/displayModel.py +++ b/source/displayModel.py @@ -548,7 +548,9 @@ def _getStoryLength(self): # Override segFlags to strictly use the old fallen-back method charSegFlag = CharSegFlag.NONE - wordSegFlag = WordSegFlag.NONE + @property + def wordSegFlag(self): + return WordSegFlag.NONE def _getTextRange(self, start, end): return "".join(x for x in self._getFieldsInRange(start, end) if isinstance(x, str)) diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py index a74ce64a32e..8b94e0e5989 100644 --- a/source/gui/settingsDialogs.py +++ b/source/gui/settingsDialogs.py @@ -6,8 +6,9 @@ # Łukasz Golonka, Aaron Cannon, Adriani90, André-Abush Clause, Dawid Pieper, # Takuya Nishimoto, jakubl7545, Tony Malykh, Rob Meredith, # Burman's Computer and Education Ltd, hwf1324, Cary-rowen, Christopher Proß. -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. +# Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt from collections.abc import Container import logging @@ -3073,6 +3074,17 @@ class DocumentNavigationPanel(SettingsPanel): def makeSettings(self, settingsSizer: wx.BoxSizer) -> None: sHelper = guiHelper.BoxSizerHelper(self, sizer=settingsSizer) + + # Translators: This is a label for the word segmentation standard in the document navigation dialog + WordNavigationUnitLabel = _("&Word Segmentation Standard:") + self.wordSegCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl( + labelText=WordNavigationUnitLabel, + wxCtrlClass=nvdaControls.FeatureFlagCombo, + keyPath=["documentNavigation", "wordSegmentationStandard"], + conf=config.conf, + ) + self.bindHelpEvent("wordSegmentationStandard", self.wordSegCombo) + # Translators: This is a label for the paragraph navigation style in the document navigation dialog paragraphStyleLabel = _("&Paragraph style:") self.paragraphStyleCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl( @@ -3084,6 +3096,7 @@ def makeSettings(self, settingsSizer: wx.BoxSizer) -> None: self.bindHelpEvent("ParagraphStyle", self.paragraphStyleCombo) def onSave(self): + self.wordSegCombo.saveCurrentValueToConf() self.paragraphStyleCombo.saveCurrentValueToConf() diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index a5c175260bd..31b427fd6d6 100755 --- a/source/textInfos/offsets.py +++ b/source/textInfos/offsets.py @@ -4,19 +4,25 @@ # For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt from abc import abstractmethod +from logging import ERROR import re import ctypes import unicodedata import NVDAHelper import config +import config.featureFlagEnums +import documentBase +import logHandler import textInfos import locationHelper from treeInterceptorHandler import TreeInterceptor import textUtils from 
textUtils.segFlag import CharSegFlag, WordSegFlag +import config from dataclasses import dataclass from typing import ( Optional, + Union, Tuple, Dict, List, @@ -158,7 +164,20 @@ class OffsetsTextInfo(textInfos.TextInfo): detectFormattingAfterCursorMaybeSlow: bool = True #: Method to calculate character and word offsets. charSegFlag: CharSegFlag = CharSegFlag.UNISCRIBE - wordSegFlag: WordSegFlag = WordSegFlag.ON_SEGMENTER + + @property + def wordSegFlag(self) -> WordSegFlag | None: + match self.wordSegConf.calculated(): + case config.featureFlagEnums.WordNavigationUnitFlag.UNISCRIBE: + return WordSegFlag.UNISCRIBE + case config.featureFlagEnums.WordNavigationUnitFlag.AUTO: + return WordSegFlag.AUTO + case config.featureFlagEnums.WordNavigationUnitFlag.CHINESE: + return WordSegFlag.CHINESE + case _: + from logHandler import log + log.error(f"Unknown word segmentation standard, {self.__wordSegConf.calculated()!r}") + #: The encoding internal to the underlying text info implementation. encoding: Optional[str] = textUtils.WCHAR_ENCODING @@ -479,6 +498,8 @@ def __init__(self, obj, position): Subclasses may extend this to perform implementation specific initialisation, calling their superclass method afterwards. 
""" super(OffsetsTextInfo, self).__init__(obj, position) + self.wordSegConf: config.featureFlag.FeatureFlag = config.conf["documentNavigation"]["wordSegmentationStandard"] + from NVDAObjects import NVDAObject if isinstance(position, locationHelper.Point): diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index d6a3abe181e..26c44922ee1 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -563,7 +563,7 @@ def __init__(self, text: str, encoding: str | None, wordSegFlag: WordSegFlag): def _choose_strategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: optimize """Choose the appropriate segmentation strategy based on the text content.""" - if self.wordSegFlag == WordSegFlag.ON_SEGMENTER: + if self.wordSegFlag == WordSegFlag.AUTO: if WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search( self.text, ) and not WordSegmenter._KANA.search(self.text): diff --git a/source/textUtils/segFlag.py b/source/textUtils/segFlag.py index 48f1b58dd01..72153c80e18 100644 --- a/source/textUtils/segFlag.py +++ b/source/textUtils/segFlag.py @@ -6,7 +6,7 @@ from enum import IntFlag # shared bit masks (explicit powers of two) -_ON_SEGMENTER: int = 1 << 0 +_AUTO: int = 1 << 0 _UNISCRIBE: int = 1 << 1 _CHINESE: int = 1 << 2 @@ -15,7 +15,7 @@ class CharSegFlag(IntFlag): """Character-level segmentation flags.""" NONE: int = 0 - ON_SEGMENTER: int = _ON_SEGMENTER + AUTO: int = _AUTO UNISCRIBE: int = _UNISCRIBE @@ -23,6 +23,6 @@ class WordSegFlag(IntFlag): """Word-level segmentation flags.""" NONE: int = 0 - ON_SEGMENTER: int = _ON_SEGMENTER + AUTO: int = _AUTO UNISCRIBE: int = _UNISCRIBE CHINESE: int = _CHINESE From 356c11c0a75359df6dea30d6a11a0e6dc00bea1b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 23 Aug 2025 15:35:45 +0000 Subject: [PATCH 32/93] Pre-commit auto-fix --- source/NVDAObjects/window/edit.py | 1 + source/displayModel.py | 1 + 
source/textInfos/offsets.py | 9 ++++----- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/source/NVDAObjects/window/edit.py b/source/NVDAObjects/window/edit.py index a1a3a5820a3..cbe1d332c91 100644 --- a/source/NVDAObjects/window/edit.py +++ b/source/NVDAObjects/window/edit.py @@ -172,6 +172,7 @@ class getTextLengthExStruct(ctypes.Structure): class EditTextInfo(textInfos.offsets.OffsetsTextInfo): # Override segFlags to enforce use of Uniscribe charSegFlag = CharSegFlag.UNISCRIBE + @property def wordSegFlag(self): return WordSegFlag.UNISCRIBE diff --git a/source/displayModel.py b/source/displayModel.py index 5e1674ccd3a..e77f3ac3b33 100644 --- a/source/displayModel.py +++ b/source/displayModel.py @@ -548,6 +548,7 @@ def _getStoryLength(self): # Override segFlags to strictly use the old fallen-back method charSegFlag = CharSegFlag.NONE + @property def wordSegFlag(self): return WordSegFlag.NONE diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index 31b427fd6d6..ca60a72a716 100755 --- a/source/textInfos/offsets.py +++ b/source/textInfos/offsets.py @@ -4,15 +4,12 @@ # For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt from abc import abstractmethod -from logging import ERROR import re import ctypes import unicodedata import NVDAHelper import config import config.featureFlagEnums -import documentBase -import logHandler import textInfos import locationHelper from treeInterceptorHandler import TreeInterceptor @@ -22,7 +19,6 @@ from dataclasses import dataclass from typing import ( Optional, - Union, Tuple, Dict, List, @@ -176,6 +172,7 @@ def wordSegFlag(self) -> WordSegFlag | None: return WordSegFlag.CHINESE case _: from logHandler import log + log.error(f"Unknown word segmentation standard, {self.__wordSegConf.calculated()!r}") #: The encoding internal to the underlying text info implementation. 
@@ -498,7 +495,9 @@ def __init__(self, obj, position): Subclasses may extend this to perform implementation specific initialisation, calling their superclass method afterwards. """ super(OffsetsTextInfo, self).__init__(obj, position) - self.wordSegConf: config.featureFlag.FeatureFlag = config.conf["documentNavigation"]["wordSegmentationStandard"] + self.wordSegConf: config.featureFlag.FeatureFlag = config.conf["documentNavigation"][ + "wordSegmentationStandard" + ] from NVDAObjects import NVDAObject From 4a680ea5beb3208ded629b899796fb054da160f2 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 24 Aug 2025 23:45:48 +0800 Subject: [PATCH 33/93] make "Auto" the default option for word navigation --- source/config/configSpec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/config/configSpec.py b/source/config/configSpec.py index b15c9ff7c45..110e632e1ae 100644 --- a/source/config/configSpec.py +++ b/source/config/configSpec.py @@ -258,7 +258,7 @@ reportClickable = boolean(default=true) [documentNavigation] - wordSegmentationStandard = featureFlag(optionsEnum="WordNavigationUnitFlag", behaviorOfDefault="Uniscribe") + wordSegmentationStandard = featureFlag(optionsEnum="WordNavigationUnitFlag", behaviorOfDefault="Auto") paragraphStyle = featureFlag(optionsEnum="ParagraphNavigationFlag", behaviorOfDefault="application") [reviewCursor] From 97b6db7ab9db401c57cd6041eeb717fb49fdbab7 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 24 Aug 2025 23:47:59 +0800 Subject: [PATCH 34/93] update for pyright checks --- source/textInfos/offsets.py | 2 -- source/textUtils/__init__.py | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index ca60a72a716..aa7e3ccd2a2 100755 --- a/source/textInfos/offsets.py +++ b/source/textInfos/offsets.py @@ -8,14 +8,12 @@ import ctypes import unicodedata import NVDAHelper -import config import 
config.featureFlagEnums import textInfos import locationHelper from treeInterceptorHandler import TreeInterceptor import textUtils from textUtils.segFlag import CharSegFlag, WordSegFlag -import config from dataclasses import dataclass from typing import ( Optional, diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 26c44922ee1..bb869b71d85 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -576,6 +576,8 @@ def _choose_strategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) case WordSegFlag.CHINESE: return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) + case _: + pass def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: """Get the segment containing the given offset.""" From 3e495d2c5cef5ca2d4224087d7da7d557444ad41 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Thu, 28 Aug 2025 18:13:11 +0800 Subject: [PATCH 35/93] add wrappers for user dict management --- nvdaHelper/cppjieba/cppjieba.cpp | 12 ++++++++++++ nvdaHelper/cppjieba/cppjieba.def | 3 +++ nvdaHelper/cppjieba/cppjieba.hpp | 6 ++++++ 3 files changed, 21 insertions(+) diff --git a/nvdaHelper/cppjieba/cppjieba.cpp b/nvdaHelper/cppjieba/cppjieba.cpp index ccd972385e9..3d6af70fd4e 100644 --- a/nvdaHelper/cppjieba/cppjieba.cpp +++ b/nvdaHelper/cppjieba/cppjieba.cpp @@ -71,6 +71,18 @@ int segmentOffsets(const char* text, int** charOffsets, int* outLen) { return 0; } +bool insertUserWord(const char* word, int freq, const char* tag = cppjieba::UNKNOWN_TAG) { + return JiebaSingleton::getInstance().InsertUserWord(string(word), freq, string(tag)); +} + +bool deleteUserWord(const char* word, const char* tag = cppjieba::UNKNOWN_TAG) { + return JiebaSingleton::getInstance().DeleteUserWord(string(word), string(tag)); +} + +bool find(const char* word) { + return JiebaSingleton::getInstance().Find(string(word)); 
+} + void freeOffsets(int* ptr) { if (ptr) free(ptr); } diff --git a/nvdaHelper/cppjieba/cppjieba.def b/nvdaHelper/cppjieba/cppjieba.def index cc2246d0f21..565aef79db0 100644 --- a/nvdaHelper/cppjieba/cppjieba.def +++ b/nvdaHelper/cppjieba/cppjieba.def @@ -2,4 +2,7 @@ LIBRARY cppjieba EXPORTS initJieba segmentOffsets + insertUserWord + deleteUserWord + find freeOffsets diff --git a/nvdaHelper/cppjieba/cppjieba.hpp b/nvdaHelper/cppjieba/cppjieba.hpp index c5f0bad4af7..2f3235f917d 100644 --- a/nvdaHelper/cppjieba/cppjieba.hpp +++ b/nvdaHelper/cppjieba/cppjieba.hpp @@ -11,6 +11,7 @@ For full terms and any additional permissions, see the NVDA license file: https: #include #include +#include #include #include #include "Jieba.hpp" @@ -56,6 +57,11 @@ JIEBA_API int initJieba(); /// @return 0 on success, -1 on failure. JIEBA_API int segmentOffsets(const char* text, int** charOffsets, int* outLen); +/// Wrapper for word management +JIEBA_API bool insertUserWord(const char* word, int freq, const char* tag); +JIEBA_API bool deleteUserWord(const char* word, const char* tag); +JIEBA_API bool find(const char* word); + /// @brief Free memory allocated by segmentOffsets. 
JIEBA_API void freeOffsets(int* ptr); From 3b2d8353de1717844c541db9bed7c14f72ab8311 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Thu, 28 Aug 2025 19:11:05 +0800 Subject: [PATCH 36/93] resolve deprecation --- source/textUtils/wordSeg/wordSegStrategy.py | 24 ++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index 84750b5b799..ac6d6cd354f 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -5,7 +5,13 @@ import os import ctypes -from ctypes import c_char_p, c_int, POINTER, byref +from ctypes import ( + c_bool, + c_char_p, + c_int, + POINTER, + byref, +) from abc import ABC, abstractmethod from functools import lru_cache from collections.abc import Callable @@ -70,9 +76,9 @@ def _ensureLibLoaded(cls): # TODO: make cppjieba alternative if cls._lib is not None: return try: - from NVDAHelper import versionedLibPath + from NVDAState import ReadPaths - lib_path = os.path.join(versionedLibPath, "cppjieba.dll") + lib_path = os.path.join(ReadPaths.coreArchLibPath, "cppjieba.dll") cls._lib = ctypes.cdll.LoadLibrary(lib_path) # Setup function signatures @@ -84,6 +90,18 @@ def _ensureLibLoaded(cls): # TODO: make cppjieba alternative cls._lib.segmentOffsets.restype = c_int cls._lib.segmentOffsets.argtypes = [c_char_p, POINTER(POINTER(c_int)), POINTER(c_int)] + # bool insertUserWord(const char* word, int freq, const char* tag) + cls._lib.insertUserWord.restype = c_bool + cls._lib.insertUserWord.argtypes = [c_char_p, c_int, c_char_p] + + # bool deleteUserWord(const char* word, const char* tag) + cls._lib.deleteUserWord.restype = c_bool + cls._lib.deleteUserWord.argtypes = [c_char_p, c_char_p] + + # bool find(const char* word) + cls._lib.find.restype = c_bool + cls._lib.find.argtypes = [c_char_p] + # void freeOffsets(int* offsets) cls._lib.freeOffsets.restype = None 
cls._lib.freeOffsets.argtypes = [POINTER(c_int)] From 9e6a2e1964a66a07b58f65b084f18ec0f26975d3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Aug 2025 11:17:31 +0000 Subject: [PATCH 37/93] Pre-commit auto-fix --- source/textUtils/wordSeg/wordSegStrategy.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index ac6d6cd354f..2472080ebdc 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -6,11 +6,11 @@ import os import ctypes from ctypes import ( - c_bool, - c_char_p, - c_int, - POINTER, - byref, + c_bool, + c_char_p, + c_int, + POINTER, + byref, ) from abc import ABC, abstractmethod from functools import lru_cache From 1869ed0022dc12ed1c19471c86167b42deabc871 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Thu, 4 Sep 2025 12:41:29 +0800 Subject: [PATCH 38/93] simplify the initialization of cppjieba drop off initialization of the keyword extractor which we don't need --- nvdaHelper/cppjieba/cppjieba.cpp | 2 +- nvdaHelper/cppjieba/cppjieba.hpp | 149 ++++++++++++++++++++++++++++++- nvdaHelper/cppjieba/sconscript | 8 +- source/setup.py | 7 +- 4 files changed, 158 insertions(+), 8 deletions(-) diff --git a/nvdaHelper/cppjieba/cppjieba.cpp b/nvdaHelper/cppjieba/cppjieba.cpp index 3d6af70fd4e..bed745dd487 100644 --- a/nvdaHelper/cppjieba/cppjieba.cpp +++ b/nvdaHelper/cppjieba/cppjieba.cpp @@ -13,7 +13,7 @@ JiebaSingleton& JiebaSingleton::getInstance() { return instance; } -JiebaSingleton::JiebaSingleton(): cppjieba::Jieba() { } // call base ctor to load dictionaries, models, etc. +JiebaSingleton::JiebaSingleton(): cppjieba::JiebaSegmenter() { } // call base ctor to load dictionaries, models, etc. 
void JiebaSingleton::getOffsets(const std::string& text, std::vector& charOffsets) { std::lock_guard lock(segMutex); diff --git a/nvdaHelper/cppjieba/cppjieba.hpp b/nvdaHelper/cppjieba/cppjieba.hpp index 2f3235f917d..c69f9dd5bb6 100644 --- a/nvdaHelper/cppjieba/cppjieba.hpp +++ b/nvdaHelper/cppjieba/cppjieba.hpp @@ -14,7 +14,152 @@ For full terms and any additional permissions, see the NVDA license file: https: #include #include #include -#include "Jieba.hpp" +#include "QuerySegment.hpp" + +// this code is from Jieba.hpp and modified to drop off its keyword extractor +namespace cppjieba { + +class JiebaSegmenter { + public: + JiebaSegmenter(const string& dict_path = "", + const string& model_path = "", + const string& user_dict_path = "") + : dict_trie_(getPath(dict_path, "jieba.dict.utf8"), getPath(user_dict_path, "user.dict.utf8")), + model_(getPath(model_path, "hmm_model.utf8")), + mp_seg_(&dict_trie_), + hmm_seg_(&model_), + mix_seg_(&dict_trie_, &model_), + full_seg_(&dict_trie_), + query_seg_(&dict_trie_, &model_) { + } + ~JiebaSegmenter() { + } + + void Cut(const string& sentence, vector& words, bool hmm = true) const { + mix_seg_.Cut(sentence, words, hmm); + } + void Cut(const string& sentence, vector& words, bool hmm = true) const { + mix_seg_.Cut(sentence, words, hmm); + } + void CutAll(const string& sentence, vector& words) const { + full_seg_.Cut(sentence, words); + } + void CutAll(const string& sentence, vector& words) const { + full_seg_.Cut(sentence, words); + } + void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { + query_seg_.Cut(sentence, words, hmm); + } + void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { + query_seg_.Cut(sentence, words, hmm); + } + void CutHMM(const string& sentence, vector& words) const { + hmm_seg_.Cut(sentence, words); + } + void CutHMM(const string& sentence, vector& words) const { + hmm_seg_.Cut(sentence, words); + } + void CutSmall(const string& sentence, 
vector& words, size_t max_word_len) const { + mp_seg_.Cut(sentence, words, max_word_len); + } + void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { + mp_seg_.Cut(sentence, words, max_word_len); + } + + bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { + return dict_trie_.InsertUserWord(word, tag); + } + + bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) { + return dict_trie_.InsertUserWord(word,freq, tag); + } + + bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) { + return dict_trie_.DeleteUserWord(word, tag); + } + + bool Find(const string& word) + { + return dict_trie_.Find(word); + } + + void ResetSeparators(const string& s) { + //TODO + mp_seg_.ResetSeparators(s); + hmm_seg_.ResetSeparators(s); + mix_seg_.ResetSeparators(s); + full_seg_.ResetSeparators(s); + query_seg_.ResetSeparators(s); + } + + const DictTrie* GetDictTrie() const { + return &dict_trie_; + } + + const HMMModel* GetHMMModel() const { + return &model_; + } + + void LoadUserDict(const vector& buf) { + dict_trie_.LoadUserDict(buf); + } + + void LoadUserDict(const set& buf) { + dict_trie_.LoadUserDict(buf); + } + + void LoadUserDict(const string& path) { + dict_trie_.LoadUserDict(path); + } + + private: + static string pathJoin(const string& dir, const string& filename) { + if (dir.empty()) { + return filename; + } + + char last_char = dir[dir.length() - 1]; + if (last_char == '/' || last_char == '\\') { + return dir + filename; + } else { + #ifdef _WIN32 + return dir + '\\' + filename; + #else + return dir + '/' + filename; + #endif + } + } + + static string getCurrentDirectory() { + string path(__FILE__); + size_t pos = path.find_last_of("/\\"); + return (pos == string::npos) ? 
"" : path.substr(0, pos); + } + + static string getPath(const string& path, const string& default_file) { + if (path.empty()) { + string current_dir = getCurrentDirectory(); + string parent_dir = current_dir.substr(0, current_dir.find_last_of("/\\")); + string grandparent_dir = parent_dir.substr(0, parent_dir.find_last_of("/\\")); + string root_dir = grandparent_dir.substr(0, grandparent_dir.find_last_of("/\\")); + return pathJoin(pathJoin(pathJoin(root_dir, "include\\cppjieba"), "dict"), default_file); + } + return path; + } + + DictTrie dict_trie_; + HMMModel model_; + + // They share the same dict trie and model + MPSegment mp_seg_; + HMMSegment hmm_seg_; + MixSegment mix_seg_; + FullSegment full_seg_; + QuerySegment query_seg_; +}; // class Jieba + +} // namespace cppjieba + #ifdef _WIN32 # define JIEBA_API __declspec(dllexport) @@ -25,7 +170,7 @@ For full terms and any additional permissions, see the NVDA license file: https: using namespace std; /// @brief Singleton wrapper around cppjieba::Jieba. -class JiebaSingleton : public cppjieba::Jieba { +class JiebaSingleton : public cppjieba::JiebaSegmenter { public: /// @brief Returns the single instance, constructing on first call. 
static JiebaSingleton& getInstance(); diff --git a/nvdaHelper/cppjieba/sconscript b/nvdaHelper/cppjieba/sconscript index 1a1fd12a2c2..739e357ab51 100644 --- a/nvdaHelper/cppjieba/sconscript +++ b/nvdaHelper/cppjieba/sconscript @@ -43,10 +43,10 @@ if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDi [ f for f in env.Glob(f"{cppjiebaDictPath}/*") - if f.name - not in ( - "README.md", - "pos_dict", + if f.name in ( + "jieba.dict.utf8", + "user.dict.utf8", + "hmm_model.utf8", ) and not f.name.endswith(".in") ], diff --git a/source/setup.py b/source/setup.py index 3f8a8e36d90..829c628fb3a 100755 --- a/source/setup.py +++ b/source/setup.py @@ -265,7 +265,12 @@ def _genManifestTemplate(shouldHaveUIAccess: bool) -> tuple[int, int, bytes]: ("images", glob("images/*.ico")), ("fonts", glob("fonts/*.ttf")), ("louis/tables", glob("louis/tables/*")), - ("cppjieba/dicts", glob("cppjieba/dicts/*")), + ( + "cppjieba/dicts", + glob("cppjieba/dicts/jieba.dict.utf8") + + glob("cppjieba/dicts/user.dict.utf8") + + glob("cppjieba/dicts/hmm_model.utf8"), + ), ("COMRegistrationFixes", glob("COMRegistrationFixes/*.reg")), ("miscDeps/tools", ["../miscDeps/tools/msgfmt.exe"]), (".", glob("../miscDeps/python/*.dll")), From a1113d80370b967bf68da1d2150c05625debf5aa Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Thu, 4 Sep 2025 14:07:24 +0800 Subject: [PATCH 39/93] add `segmentedText` method --- source/textUtils/__init__.py | 7 +- source/textUtils/wordSeg/wordSegStrategy.py | 163 ++++++++++++-------- 2 files changed, 106 insertions(+), 64 deletions(-) diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index bb869b71d85..43aa36a5973 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -555,7 +555,7 @@ class WordSegmenter: # Japanese kana (Hiragana U+3040 - U+309F, Katakana U+30A0 - U+30FF) _KANA: re.Pattern = re.compile(r"[\u3040-\u309F\u30A0-\u30FF]") - def __init__(self, text: str, encoding: str | 
None, wordSegFlag: WordSegFlag): + def __init__(self, text: str, encoding: str = "UTF-8", wordSegFlag: WordSegFlag = WordSegFlag.AUTO): self.text: str = text self.encoding: str | None = encoding self.wordSegFlag: WordSegFlag = wordSegFlag @@ -582,7 +582,7 @@ def _choose_strategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: """Get the segment containing the given offset.""" try: - return self.strategy.getSegmentForOffset(self.text, offset) + return self.strategy.getSegmentForOffset(offset) except Exception as e: log.debugWarning( "WordSegmenter.getSegmentForOffset failed: %s text: '%s' offset: %s segmentation strategy: %s", @@ -592,3 +592,6 @@ def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: self.strategy, ) return None + + def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: + return self.strategy.segmentedText(sep, newSepIndex) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index 2472080ebdc..2ab6720d053 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -58,10 +58,71 @@ def __init__(self, text: str, encoding: str | None = None): self.encoding = encoding @abstractmethod - def getSegmentForOffset(self, text: str, offset: int) -> tuple[int, int] | None: # TODO: optimize + def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: # TODO: optimize """Return (start inclusive, end exclusive) or None. Offsets are str offsets relative to self.text.""" pass + @abstractmethod + def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: + """Segmented result with separators.""" + pass + + +class UniscribeWordSegmentationStrategy(WordSegmentationStrategy): + """Windows Uniscribe-based segmentation (calls NVDAHelper.localLib.calculateWordOffsets).""" + + # Copied from OffsetTextInfos. 
TODO: optimize + def _calculateUniscribeOffsets( + self, + lineText: str, + relOffset: int, + ) -> tuple[int, int] | None: + """ + Calculates the bounds of a unit at an offset within a given string of text + using the Windows uniscribe library, also used in Notepad, for example. + Units supported are character and word. + @param lineText: the text string to analyze + @param relOffset: the character offset within the text string at which to calculate the bounds. + """ + + import NVDAHelper + + helperFunc = NVDAHelper.localLib.calculateWordOffsets + + relStart = ctypes.c_int() + relEnd = ctypes.c_int() + # uniscribe does some strange things + # when you give it a string with not more than two alphanumeric chars in a row. + # Inject two alphanumeric characters at the end to fix this + uniscribeLineText = lineText + "xx" + # We can't rely on len(lineText) to calculate the length of the line. + offsetConverter = textUtils.WideStringOffsetConverter(lineText) + lineLength = offsetConverter.encodedStringLength + if self.encoding != textUtils.WCHAR_ENCODING: + # We need to convert the str based line offsets to wide string offsets. + relOffset = offsetConverter.strToEncodedOffsets(relOffset, relOffset)[0] + uniscribeLineLength = lineLength + 2 + if helperFunc( + uniscribeLineText, + uniscribeLineLength, + relOffset, + ctypes.byref(relStart), + ctypes.byref(relEnd), + ): + relStart = relStart.value + relEnd = min(lineLength, relEnd.value) + if self.encoding != textUtils.WCHAR_ENCODING: + # We need to convert the uniscribe based offsets to str offsets. 
+ relStart, relEnd = offsetConverter.encodedToStrOffsets(relStart, relEnd) + return (relStart, relEnd) + return None + + def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: + return self._calculateUniscribeOffsets(self.text, offset) + + def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: + return self.text + class ChineseWordSegmentationStrategy(WordSegmentationStrategy): _lib = None @@ -140,79 +201,57 @@ def _callCppjiebaCached(text_utf8: bytes) -> list[int]: return [] @lru_cache(maxsize=128) - def _callCPPJieba(self, text: str) -> list[tuple[int, int]] | None: - data = text.encode("utf-8") + def _callCPPJieba(self) -> list[int] | None: + data = self.text.encode("utf-8") charPtr = POINTER(c_int)() outLen = c_int() result = self._lib.segmentOffsets(data, byref(charPtr), byref(outLen)) if result != 0 or not charPtr: - return [], [] + return n = outLen.value - char_offsets = [charPtr[i] for i in range(n)] + charOffsets = [charPtr[i] for i in range(n)] self._lib.freeOffsets(charPtr) - return char_offsets + return charOffsets + + def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: + """Segments the text using the word end indices.""" + if len(self.wordEndIndex) <= 1: + return self.text + result = "" + for sepIndex in range(len(self.wordEndIndex) - 1): + if sepIndex == 0: + preIndex = 0 + else: + preIndex = self.wordEndIndex[sepIndex - 1] + curIndex = self.wordEndIndex[sepIndex] + postIndex = self.wordEndIndex[sepIndex + 1] + result += self.text[preIndex:curIndex] + if ( + (sepIndex < len(self.wordEndIndex) - 1) + and not (result.endswith(sep)) + and not (self.text[curIndex:postIndex].startswith(sep)) + ): + """The separator needs adding.""" + result += sep + if newSepIndex is not None: + """Track the index of the separators.""" + newSepIndex.append(len(result) - len(sep)) + else: + result += self.text[curIndex:postIndex] + return result - def getSegmentForOffset(self, text: str, 
offset: int) -> tuple[int, int] | None: - wordEnds = self._callCPPJieba(text) + def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: + wordEnds = self._callCPPJieba() if wordEnds is None or not wordEnds: - return None + return index = next((i for i, end in enumerate(wordEnds) if end > offset)) if index == 0: start = 0 else: start = wordEnds[index - 1] - end = wordEnds[index] if index < len(wordEnds) else len(text) + end = wordEnds[index] if index < len(wordEnds) else len(self.text) return (start, end) - -class UniscribeWordSegmentationStrategy(WordSegmentationStrategy): - """Windows Uniscribe-based segmentation (calls NVDAHelper.localLib.calculateWordOffsets).""" - - # Copied from OffsetTextInfos. TODO: optimize - def _calculateUniscribeOffsets( - self, - lineText: str, - relOffset: int, - ) -> tuple[int, int] | None: - """ - Calculates the bounds of a unit at an offset within a given string of text - using the Windows uniscribe library, also used in Notepad, for example. - Units supported are character and word. - @param lineText: the text string to analyze - @param relOffset: the character offset within the text string at which to calculate the bounds. - """ - - import NVDAHelper - - helperFunc = NVDAHelper.localLib.calculateWordOffsets - - relStart = ctypes.c_int() - relEnd = ctypes.c_int() - # uniscribe does some strange things - # when you give it a string with not more than two alphanumeric chars in a row. - # Inject two alphanumeric characters at the end to fix this - uniscribeLineText = lineText + "xx" - # We can't rely on len(lineText) to calculate the length of the line. - offsetConverter = textUtils.WideStringOffsetConverter(lineText) - lineLength = offsetConverter.encodedStringLength - if self.encoding != textUtils.WCHAR_ENCODING: - # We need to convert the str based line offsets to wide string offsets. 
- relOffset = offsetConverter.strToEncodedOffsets(relOffset, relOffset)[0] - uniscribeLineLength = lineLength + 2 - if helperFunc( - uniscribeLineText, - uniscribeLineLength, - relOffset, - ctypes.byref(relStart), - ctypes.byref(relEnd), - ): - relStart = relStart.value - relEnd = min(lineLength, relEnd.value) - if self.encoding != textUtils.WCHAR_ENCODING: - # We need to convert the uniscribe based offsets to str offsets. - relStart, relEnd = offsetConverter.encodedToStrOffsets(relStart, relEnd) - return (relStart, relEnd) - return None - - def getSegmentForOffset(self, text: str, offset: int) -> tuple[int, int] | None: - return self._calculateUniscribeOffsets(text, offset) + def __init__(self, text, encoding=None): + super().__init__(text, encoding) + self.wordEndIndex = self._callCPPJieba() From 2d7c5968e2b0cc70bb4b2078b42f284e9c71a37f Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Fri, 5 Sep 2025 00:09:16 +0800 Subject: [PATCH 40/93] add word separator to optimize braille output for Chinese text * add a subclass of `OffsetConverter` to handle the offset mapping for raw text and separated one * add logic to invoke it when Chinese translation tables are used --- source/braille.py | 11 ++- source/textUtils/wordSeg/wordSegUtils.py | 91 ++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 source/textUtils/wordSeg/wordSegUtils.py diff --git a/source/braille.py b/source/braille.py index bd27ed1a2c8..16e4896b05a 100644 --- a/source/braille.py +++ b/source/braille.py @@ -66,7 +66,7 @@ import brailleViewer from autoSettingsUtils.driverSetting import BooleanDriverSetting, NumericDriverSetting from utils.security import objectBelowLockScreenAndWindowsIsLocked, post_sessionLockStateChanged -from textUtils import isUnicodeNormalized, UnicodeNormalizationOffsetConverter +from textUtils import isUnicodeNormalized, OffsetConverter, UnicodeNormalizationOffsetConverter import hwIo from editableText import EditableText 
@@ -568,10 +568,17 @@ def update(self): if config.conf["braille"]["expandAtCursor"] and self.cursorPos is not None: mode |= louis.compbrlAtCursor - converter: UnicodeNormalizationOffsetConverter | None = None + converter: OffsetConverter | None = None textToTranslate = self.rawText textToTranslateTypeforms = self.rawTextTypeforms cursorPos = self.cursorPos + if config.conf["braille"]["translationTable"].startswith("zh"): + from textUtils.wordSeg.wordSegUtils import WordSegWithSeparatorOffsetConverter # noqa: F401 + + converter = WordSegWithSeparatorOffsetConverter(textToTranslate) + textToTranslate = converter.encoded + if cursorPos is not None: + cursorPos = converter.strToEncodedOffsets(cursorPos) if config.conf["braille"]["unicodeNormalization"] and not isUnicodeNormalized(textToTranslate): converter = UnicodeNormalizationOffsetConverter(textToTranslate) textToTranslate = converter.encoded diff --git a/source/textUtils/wordSeg/wordSegUtils.py b/source/textUtils/wordSeg/wordSegUtils.py new file mode 100644 index 00000000000..d32f09bae96 --- /dev/null +++ b/source/textUtils/wordSeg/wordSegUtils.py @@ -0,0 +1,91 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +from functools import cached_property +from textUtils import OffsetConverter, WordSegmenter + + +class WordSegWithSeparatorOffsetConverter(OffsetConverter): + """An offset converter for text with word segmentation separator.""" + + sep: str = " " + computedStrToEncodedOffsets: list[int] + computedEncodedToStrOffsets: list[int] + + def __init__(self, text: str): + super().__init__(text) + self.newSepIndex: list[int] = [] + self.encoded = WordSegmenter(text).segmentedText(sep=self.sep, newSepIndex=self.newSepIndex) + self.computedStrToEncodedOffsets = list(range(self.strLength)) + for i in range(len(self.computedStrToEncodedOffsets)): + self.computedStrToEncodedOffsets[i] += self._relevantStrToEncodedOffsets[i] + self.computedEncodedToStrOffsets = list(range(self.encodedStringLength)) + for j in range(len(self.computedEncodedToStrOffsets)): + self.computedEncodedToStrOffsets[j] += self._relevantEncodedToStrOffsets[j] + + @property + def _relevantStrToEncodedOffsets(self) -> list[int]: + relevantIndex: list[int] = [0 for _ in range(self.strLength)] + j = 0 + m = len(self.newSepIndex) + for i in range(self.strLength): + while j < m and self.newSepIndex[j] <= i + j: + j += 1 + relevantIndex[i] = j + return relevantIndex + + @property + def _relevantEncodedToStrOffsets(self) -> list[int]: + relevantIndex: list[int] = [0 for _ in range(self.encodedStringLength)] + j = 0 + m = len(self.newSepIndex) + for i in range(self.encodedStringLength): + while j < m and self.newSepIndex[j] < i + j: + j += 1 + relevantIndex[i] = -j + return relevantIndex + + @cached_property + def encodedStringLength(self) -> int: + """Returns the length of the string in its subclass-specific encoded representation.""" + return len(self.encoded) + + def strToEncodedOffsets( + self, + strStart: int, + strEnd: int | None = None, + raiseOnError: bool = False, + ) -> int 
| tuple[int, int]: + super().strToEncodedOffsets(strStart, strEnd, raiseOnError) + if strStart == 0: + resultStart = 0 + else: + resultStart = self.computedStrToEncodedOffsets[strStart] + if strEnd is None: + return resultStart + elif strStart == strEnd: + return (resultStart, resultStart) + else: + resultEnd = self.computedStrToEncodedOffsets[strEnd] + return (resultStart, resultEnd) + + def encodedToStrOffsets( + self, + encodedStart: int, + encodedEnd: int | None = None, + raiseOnError: bool = False, + ) -> int | tuple[int]: + super().encodedToStrOffsets(encodedStart, encodedEnd, raiseOnError) + if encodedStart == 0: + resultStart = 0 + else: + resultStart = self.computedEncodedToStrOffsets[encodedStart] + if encodedEnd is None: + return resultStart + elif encodedStart == encodedEnd: + return (resultStart, resultStart) + else: + resultEnd = self.computedEncodedToStrOffsets[encodedEnd] + return (resultStart, resultEnd) From 49cc1fe9b95d39a8c77b987ffdc1882d5767206d Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sat, 6 Sep 2025 18:31:05 +0800 Subject: [PATCH 41/93] update cppjieba to the latest commit --- include/cppjieba | 2 +- projectDocs/dev/createDevEnvironment.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cppjieba b/include/cppjieba index 9b40903ed6c..9408c1d08fa 160000 --- a/include/cppjieba +++ b/include/cppjieba @@ -1 +1 @@ -Subproject commit 9b40903ed6cbd795367ea64f9a7d3f3bc4aa4714 +Subproject commit 9408c1d08facc6e324dc90260e8cb20ecceebf70 diff --git a/projectDocs/dev/createDevEnvironment.md b/projectDocs/dev/createDevEnvironment.md index 13d6cf0aace..4981fe6db77 100644 --- a/projectDocs/dev/createDevEnvironment.md +++ b/projectDocs/dev/createDevEnvironment.md @@ -100,7 +100,7 @@ If you aren't sure, run `git submodule update` after every git pull, merge or ch * [Java Access Bridge 32 bit, from Zulu Community OpenJDK build 17.0.9+8Zulu (17.46.19)](https://github.com/nvaccess/javaAccessBridge32-bin) * We 
are in the process of switching to: Java Access Bridge 64 bit, from Zulu Community OpenJDK build 17.0.16+8Zulu (17.60.17) * [Windows Implementation Libraries (WIL)](https://github.com/microsoft/wil/) -* [cppjieba - Chinese word segmentation](https://github.com/yanyiwu/cppjieba), commit `9b40903ed6cbd795367ea64f9a7d3f3bc4aa4714` +* [cppjieba - Chinese word segmentation](https://github.com/yanyiwu/cppjieba), commit `9408c1d08facc6e324dc90260e8cb20ecceebf70` #### Build time dependencies From a9281f623ddcb7334fbabb5ec6d4130aba490993 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sat, 6 Sep 2025 18:36:20 +0800 Subject: [PATCH 42/93] update .gitattributes for .hpp header files --- .gitattributes | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitattributes b/.gitattributes index a27b741841f..27936ed7319 100644 --- a/.gitattributes +++ b/.gitattributes @@ -50,6 +50,7 @@ sconstruct text diff=python *.c text diff=c *.cpp text diff=cpp *.h text diff=c +*.hpp text diff=cpp *.idl text diff=c *.acf text diff=c From 2955ca8a24d62aee3028ddce710d2f8536c07f0e Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 7 Sep 2025 12:28:14 +0800 Subject: [PATCH 43/93] simplify helper of `cppjieba` * turn to build-in `Word` structure * remove some items we don't use --- nvdaHelper/cppjieba/cppjieba.cpp | 28 +++--------- nvdaHelper/cppjieba/cppjieba.hpp | 76 +++++++------------------------- 2 files changed, 22 insertions(+), 82 deletions(-) diff --git a/nvdaHelper/cppjieba/cppjieba.cpp b/nvdaHelper/cppjieba/cppjieba.cpp index bed745dd487..c45286d1a68 100644 --- a/nvdaHelper/cppjieba/cppjieba.cpp +++ b/nvdaHelper/cppjieba/cppjieba.cpp @@ -15,27 +15,13 @@ JiebaSingleton& JiebaSingleton::getInstance() { JiebaSingleton::JiebaSingleton(): cppjieba::JiebaSegmenter() { } // call base ctor to load dictionaries, models, etc. 
-void JiebaSingleton::getOffsets(const std::string& text, std::vector& charOffsets) { +void JiebaSingleton::getOffsets(const std::string& text, std::vector& wordEndOffsets) { std::lock_guard lock(segMutex); - std::vector words; + std::vector words; this->Cut(text, words, true); - int cumulative = 0; - for (auto const& w : words) { - int wc = 0; - auto ptr = reinterpret_cast(w.c_str()); - size_t i = 0, len = w.size(); - while (i < len) { - unsigned char c = ptr[i]; - if ((c & 0x80) == 0) i += 1; - else if ((c & 0xE0) == 0xC0) i += 2; - else if ((c & 0xF0) == 0xE0) i += 3; - else if ((c & 0xF8) == 0xF0) i += 4; - else i += 1; - ++wc; - } - cumulative += wc; - charOffsets.push_back(cumulative); + for (auto const& word : words) { + wordEndOffsets.push_back(word.unicode_offset); } } @@ -51,8 +37,8 @@ int initJieba() { } } -int segmentOffsets(const char* text, int** charOffsets, int* outLen) { - if (!text || !charOffsets || !outLen) return -1; +int segmentOffsets(const char* text, int** wordEndOffsets, int* outLen) { + if (!text || !wordEndOffsets || !outLen) return -1; // we assume initJieba() has already been called successfully std::string input(text); @@ -66,7 +52,7 @@ int segmentOffsets(const char* text, int** charOffsets, int* outLen) { return -1; } for (int i = 0; i < n; ++i) buf[i] = offs[i]; - *charOffsets = buf; + *wordEndOffsets = buf; *outLen = n; return 0; } diff --git a/nvdaHelper/cppjieba/cppjieba.hpp b/nvdaHelper/cppjieba/cppjieba.hpp index c69f9dd5bb6..900d60b734c 100644 --- a/nvdaHelper/cppjieba/cppjieba.hpp +++ b/nvdaHelper/cppjieba/cppjieba.hpp @@ -16,8 +16,9 @@ For full terms and any additional permissions, see the NVDA license file: https: #include #include "QuerySegment.hpp" -// this code is from Jieba.hpp and modified to drop off its keyword extractor -namespace cppjieba { +using namespace std; + +namespace cppjieba { // copied from Jieba.hpp and modified to drop off its keyword extractor we don't use class JiebaSegmenter { public: @@ -26,49 
+27,14 @@ class JiebaSegmenter { const string& user_dict_path = "") : dict_trie_(getPath(dict_path, "jieba.dict.utf8"), getPath(user_dict_path, "user.dict.utf8")), model_(getPath(model_path, "hmm_model.utf8")), - mp_seg_(&dict_trie_), - hmm_seg_(&model_), - mix_seg_(&dict_trie_, &model_), - full_seg_(&dict_trie_), - query_seg_(&dict_trie_, &model_) { + mix_seg_(&dict_trie_, &model_) { } ~JiebaSegmenter() { } - void Cut(const string& sentence, vector& words, bool hmm = true) const { - mix_seg_.Cut(sentence, words, hmm); - } void Cut(const string& sentence, vector& words, bool hmm = true) const { mix_seg_.Cut(sentence, words, hmm); } - void CutAll(const string& sentence, vector& words) const { - full_seg_.Cut(sentence, words); - } - void CutAll(const string& sentence, vector& words) const { - full_seg_.Cut(sentence, words); - } - void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { - query_seg_.Cut(sentence, words, hmm); - } - void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { - query_seg_.Cut(sentence, words, hmm); - } - void CutHMM(const string& sentence, vector& words) const { - hmm_seg_.Cut(sentence, words); - } - void CutHMM(const string& sentence, vector& words) const { - hmm_seg_.Cut(sentence, words); - } - void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { - mp_seg_.Cut(sentence, words, max_word_len); - } - void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { - mp_seg_.Cut(sentence, words, max_word_len); - } - - bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { - return dict_trie_.InsertUserWord(word, tag); - } bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) { return dict_trie_.InsertUserWord(word,freq, tag); @@ -84,12 +50,7 @@ class JiebaSegmenter { } void ResetSeparators(const string& s) { - //TODO - mp_seg_.ResetSeparators(s); - hmm_seg_.ResetSeparators(s); 
mix_seg_.ResetSeparators(s); - full_seg_.ResetSeparators(s); - query_seg_.ResetSeparators(s); } const DictTrie* GetDictTrie() const { @@ -150,35 +111,22 @@ class JiebaSegmenter { DictTrie dict_trie_; HMMModel model_; - // They share the same dict trie and model - MPSegment mp_seg_; - HMMSegment hmm_seg_; MixSegment mix_seg_; - FullSegment full_seg_; - QuerySegment query_seg_; -}; // class Jieba +}; // class JiebaSegmenter } // namespace cppjieba -#ifdef _WIN32 -# define JIEBA_API __declspec(dllexport) -#else -# define JIEBA_API -#endif - -using namespace std; - /// @brief Singleton wrapper around cppjieba::Jieba. class JiebaSingleton : public cppjieba::JiebaSegmenter { public: /// @brief Returns the single instance, constructing on first call. static JiebaSingleton& getInstance(); - /// @brief Do thread-safe segmentation and compute character end offsets. + /// @brief Do thread-safe segmentation and compute word end offsets. /// @param text The input text in UTF-8 encoding. - /// @param charOffsets Output vector to hold character offsets. - void getOffsets(const string& text, vector& charOffsets); + /// @param wordEndOffsets Output vector to hold word offsets. + void getOffsets(const string& text, vector& wordEndOffsets); private: JiebaSingleton(); ///< private ctor initializes base Jieba @@ -192,6 +140,12 @@ class JiebaSingleton : public cppjieba::JiebaSegmenter { std::mutex segMutex; ///< guards concurrent Cut() calls }; +#ifdef _WIN32 +# define JIEBA_API __declspec(dllexport) +#else +# define JIEBA_API +#endif + extern "C" { /// @brief Force singleton construction (load dicts, etc.) before any segmentation. @@ -200,7 +154,7 @@ JIEBA_API int initJieba(); /// @brief Segment UTF-8 text into character offsets. /// @return 0 on success, -1 on failure. 
-JIEBA_API int segmentOffsets(const char* text, int** charOffsets, int* outLen); +JIEBA_API int segmentOffsets(const char* text, int** wordEndOffsets, int* outLen); /// Wrapper for word management JIEBA_API bool insertUserWord(const char* word, int freq, const char* tag); From b848e1ba10728ed58805c481225b088f705cea4e Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 7 Sep 2025 16:28:01 +0800 Subject: [PATCH 44/93] update `wordSegStrategy.py` --- source/textUtils/wordSeg/wordSegStrategy.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index 2ab6720d053..c81ade7b761 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -17,8 +17,9 @@ from collections.abc import Callable from typing import Any -from logHandler import log import textUtils +from logHandler import log + # Initializer registry (robust: saves module + qualname + original function + args/kwargs) # Each entry: (module_name: str, qualname: str, func_obj: Callable, args: tuple, kwargs: dict) @@ -147,7 +148,7 @@ def _ensureLibLoaded(cls): # TODO: make cppjieba alternative cls._lib.initJieba.restype = c_int cls._lib.initJieba.argtypes = [] - # int segmentOffsets(const char* utf8Text, int** outOffsets, int* outLen) + #int segmentOffsets(const char* text, int** wordEndOffsets, int* outLen); cls._lib.segmentOffsets.restype = c_int cls._lib.segmentOffsets.argtypes = [c_char_p, POINTER(POINTER(c_int)), POINTER(c_int)] From 3bfbe59d4c310b75010362f5f4f2936bfadf0d47 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 7 Sep 2025 17:04:33 +0800 Subject: [PATCH 45/93] update module importing order and type annotations --- source/NVDAObjects/window/edit.py | 12 +++-------- source/displayModel.py | 29 +++++++++++++++------------ source/gui/settingsDialogs.py | 33 ++++++++++++++----------------- 
source/textInfos/offsets.py | 19 +++++++----------- source/textUtils/__init__.py | 28 +++++++++++++------------- source/textUtils/segFlag.py | 1 + 6 files changed, 56 insertions(+), 66 deletions(-) diff --git a/source/NVDAObjects/window/edit.py b/source/NVDAObjects/window/edit.py index 1961ccdd258..9004d97a774 100644 --- a/source/NVDAObjects/window/edit.py +++ b/source/NVDAObjects/window/edit.py @@ -3,12 +3,6 @@ # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. # For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt -from typing import ( - Dict, - Optional, - Union, -) - import ctypes from comtypes import BSTR, COMError import colors @@ -381,7 +375,7 @@ def _getFormatFieldAndOffsets(self, offset, formatConfig, calculateOffsets=True) def _setFormatFieldColor( self, - charFormat: Union[CharFormat2AStruct, CharFormat2WStruct], + charFormat: CharFormat2AStruct | CharFormat2WStruct, formatField: textInfos.FormatField, ) -> None: if charFormat.dwEffects & CFE_AUTOCOLOR: @@ -662,7 +656,7 @@ def _getParagraphOffsets(self, offset): comInterfaces.tom.tomStory: textInfos.UNIT_STORY, } -NVDAUnitsToITextDocumentUnits: Dict[str, int] = { +NVDAUnitsToITextDocumentUnits: dict[str, int] = { textInfos.UNIT_CHARACTER: comInterfaces.tom.tomCharacter, textInfos.UNIT_WORD: comInterfaces.tom.tomWord, textInfos.UNIT_LINE: comInterfaces.tom.tomLine, @@ -915,7 +909,7 @@ def __init__(self, obj, position, _rangeObj=None): else: raise NotImplementedError("position: %s" % position) - def getTextWithFields(self, formatConfig: Optional[Dict] = None) -> textInfos.TextInfo.TextWithFieldsT: + def getTextWithFields(self, formatConfig: dict | None = None) -> textInfos.TextInfo.TextWithFieldsT: if not formatConfig: formatConfig = config.conf["documentFormatting"] textRange = self._rangeObj.duplicate diff --git a/source/displayModel.py 
b/source/displayModel.py index 941f11a6234..e13e5c432e0 100644 --- a/source/displayModel.py +++ b/source/displayModel.py @@ -3,9 +3,14 @@ # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. # For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt -from ctypes import byref, c_short, c_long +from ctypes import ( + byref, + c_short, + c_long, +) import unicodedata import math + from NVDAHelper import localLib import colors import XMLFormatting @@ -22,14 +27,12 @@ import windowUtils from locationHelper import RectLTRB, RectLTWH import textUtils -from textUtils.segFlag import CharSegFlag, WordSegFlag -from typing import ( - List, - Tuple, - Optional, - Dict, +from textUtils.segFlag import ( + CharSegFlag, + WordSegFlag, ) + #: A text info unit constant for a single chunk in a display model UNIT_DISPLAYCHUNK = "displayChunk" @@ -385,11 +388,11 @@ def __init__(self, obj, position, limitRect=None): def _get__storyFieldsAndRects( self, - ) -> Tuple[ - List[textInfos.TextInfo.TextOrFieldsT], - List[RectLTRB], - List[int], - List[int], + ) -> tuple[ + list[textInfos.TextInfo.TextOrFieldsT], + list[RectLTRB], + list[int], + list[int], ]: # All returned coordinates are logical coordinates. 
if self._location: @@ -536,7 +539,7 @@ def wordSegFlag(self): def _getTextRange(self, start, end): return "".join(x for x in self._getFieldsInRange(start, end) if isinstance(x, str)) - def getTextWithFields(self, formatConfig: Optional[Dict] = None) -> textInfos.TextInfo.TextWithFieldsT: + def getTextWithFields(self, formatConfig: dict | None = None) -> textInfos.TextInfo.TextWithFieldsT: start = self._startOffset end = self._endOffset if start == end: diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py index d4e83f06e78..4dd59ce06ab 100644 --- a/source/gui/settingsDialogs.py +++ b/source/gui/settingsDialogs.py @@ -69,9 +69,6 @@ from typing import ( Any, Callable, - List, - Optional, - Set, ) import core import keyboardHandler @@ -197,7 +194,7 @@ def __init__( hasApplyButton: bool = False, settingsSizerOrientation: int = wx.VERTICAL, multiInstanceAllowed: bool = False, - buttons: Set[int] = {wx.OK, wx.CANCEL}, + buttons: set[int] = {wx.OK, wx.CANCEL}, ): """ @param parent: The parent for this dialog; C{None} for no parent. 
@@ -436,7 +433,7 @@ def _validationErrorMessageBox( self, message: str, option: str, - category: Optional[str] = None, + category: str | None = None, ): if category is None: category = self.title @@ -505,7 +502,7 @@ class MultiCategorySettingsDialog(SettingsDialog): """ title = "" - categoryClasses: typing.List[typing.Type[SettingsPanel]] = [] + categoryClasses: list[type[SettingsPanel]] = [] class CategoryUnavailableError(RuntimeError): pass @@ -1268,7 +1265,7 @@ class SynthesizerSelectionDialog(SettingsDialog): # Translators: This is the label for the synthesizer selection dialog title = _("Select Synthesizer") helpId = "SynthesizerSelection" - synthNames: List[str] = [] + synthNames: list[str] = [] def makeSettings(self, settingsSizer): settingsSizerHelper = guiHelper.BoxSizerHelper(self, sizer=settingsSizer) @@ -5146,7 +5143,7 @@ def _onModeChange(self, evt: wx.CommandEvent): def showStartErrorForProviders( parent: wx.Window, - providers: List[vision.providerInfo.ProviderInfo], + providers: list[vision.providerInfo.ProviderInfo], ) -> None: if not providers: return @@ -5176,7 +5173,7 @@ def showStartErrorForProviders( def showTerminationErrorForProviders( parent: wx.Window, - providers: List[vision.providerInfo.ProviderInfo], + providers: list[vision.providerInfo.ProviderInfo], ) -> None: if not providers: return @@ -5222,7 +5219,7 @@ def __init__( def getProviderInfo(self) -> vision.providerInfo.ProviderInfo: return self._providerInfo - def getProviderInstance(self) -> Optional[vision.providerBase.VisionEnhancementProvider]: + def getProviderInstance(self) -> vision.providerBase.VisionEnhancementProvider | None: return vision.handler.getProviderInstance(self._providerInfo) def startProvider( @@ -5286,8 +5283,8 @@ def _doTerminate(self) -> bool: class VisionSettingsPanel(SettingsPanel): settingsSizerHelper: guiHelper.BoxSizerHelper - providerPanelInstances: List[SettingsPanel] - initialProviders: List[vision.providerInfo.ProviderInfo] + providerPanelInstances: 
list[SettingsPanel] + initialProviders: list[vision.providerInfo.ProviderInfo] # Translators: This is the label for the vision panel title = _("Vision") helpId = "VisionSettings" @@ -5298,7 +5295,7 @@ class VisionSettingsPanel(SettingsPanel): def _createProviderSettingsPanel( self, providerInfo: vision.providerInfo.ProviderInfo, - ) -> Optional[SettingsPanel]: + ) -> SettingsPanel | None: settingsPanelCls = providerInfo.providerClass.getSettingsPanelClass() if not settingsPanelCls: if gui._isDebug(): @@ -5343,12 +5340,12 @@ def makeSettings(self, settingsSizer: wx.BoxSizer): def safeInitProviders( self, - providers: List[vision.providerInfo.ProviderInfo], + providers: list[vision.providerInfo.ProviderInfo], ) -> None: """Initializes one or more providers in a way that is gui friendly, showing an error if appropriate. """ - errorProviders: List[vision.providerInfo.ProviderInfo] = [] + errorProviders: list[vision.providerInfo.ProviderInfo] = [] for provider in providers: success = VisionProviderStateControl(self, provider).startProvider(shouldPromptOnError=False) if not success: @@ -5357,14 +5354,14 @@ def safeInitProviders( def safeTerminateProviders( self, - providers: List[vision.providerInfo.ProviderInfo], + providers: list[vision.providerInfo.ProviderInfo], verbose: bool = False, ) -> None: """Terminates one or more providers in a way that is gui friendly, @verbose: Whether to show a termination error. @returns: Whether termination succeeded for all providers. 
""" - errorProviders: List[vision.providerInfo.ProviderInfo] = [] + errorProviders: list[vision.providerInfo.ProviderInfo] = [] for provider in providers: success = VisionProviderStateControl(self, provider).terminateProvider(shouldPromptOnError=False) if not success: @@ -5457,7 +5454,7 @@ def __init__( providerControl: VisionProviderStateControl, ): self._providerControl = providerControl - self._providerSettings: Optional[VisionProviderSubPanel_Settings] = None + self._providerSettings: VisionProviderSubPanel_Settings | None = None self._providerSettingsSizer = wx.BoxSizer(orient=wx.VERTICAL) super().__init__(parent=parent) diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index f663dd2daf6..37c0a9cab34 100755 --- a/source/textInfos/offsets.py +++ b/source/textInfos/offsets.py @@ -7,6 +7,9 @@ import re import ctypes import unicodedata +from dataclasses import dataclass +from typing import Self + import NVDAHelper import config.featureFlagEnums import textInfos @@ -14,14 +17,6 @@ from treeInterceptorHandler import TreeInterceptor import textUtils from textUtils.segFlag import CharSegFlag, WordSegFlag -from dataclasses import dataclass -from typing import ( - Optional, - Tuple, - Dict, - List, - Self, -) from logHandler import log @@ -174,7 +169,7 @@ def wordSegFlag(self) -> WordSegFlag | None: log.error(f"Unknown word segmentation standard, {self.__wordSegConf.calculated()!r}") #: The encoding internal to the underlying text info implementation. - encoding: Optional[str] = textUtils.WCHAR_ENCODING + encoding: str | None = textUtils.WCHAR_ENCODING def __eq__(self, other): if self is other or ( @@ -209,7 +204,7 @@ def _get_locationText(self): # C901 '_get_boundingRects' is too complex # Note: when working on _get_boundingRects, look for opportunities to simplify # and move logic out into smaller helper functions. 
- def _get_boundingRects(self) -> List[locationHelper.RectLTWH]: # noqa: C901 + def _get_boundingRects(self) -> list[locationHelper.RectLTWH]: # noqa: C901 if self.isCollapsed: return [] startOffset = self._startOffset @@ -342,7 +337,7 @@ def _calculateUniscribeOffsets( lineText: str, unit: str, relOffset: int, - ) -> Optional[Tuple[int, int]]: + ) -> tuple[int, int] | None: """ Calculates the bounds of a unit at an offset within a given string of text using the Windows uniscribe library, also used in Notepad, for example. @@ -623,7 +618,7 @@ def setEndPoint(self, other, which): else: self._startOffset = self._endOffset - def getTextWithFields(self, formatConfig: Optional[Dict] = None) -> textInfos.TextInfo.TextWithFieldsT: + def getTextWithFields(self, formatConfig: dict | None = None) -> textInfos.TextInfo.TextWithFieldsT: if not formatConfig: formatConfig = config.conf["documentFormatting"] if self.detectFormattingAfterCursorMaybeSlow and not formatConfig["detectFormatAfterCursor"]: diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 43aa36a5973..45c4e0c2dfc 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -15,7 +15,7 @@ from abc import ABCMeta, abstractmethod, abstractproperty from functools import cached_property -from typing import Generator, Optional, Tuple, Type +from typing import Generator from logHandler import log @@ -55,7 +55,7 @@ def strToEncodedOffsets( strStart: int, strEnd: int | None = None, raiseOnError: bool = False, - ) -> int | Tuple[int, int]: + ) -> int | tuple[int, int]: """ This method takes two offsets from the str representation of the string the object is initialized with, and converts them to subclass-specific encoded string offsets. 
@@ -84,7 +84,7 @@ def encodedToStrOffsets( encodedStart: int, encodedEnd: int | None = None, raiseOnError: bool = False, - ) -> int | Tuple[int, int]: + ) -> int | tuple[int, int]: r""" This method takes two offsets from subclass-specific encoded string representation of the string the object is initialized with, and converts them to str offsets. @@ -140,7 +140,7 @@ def strToEncodedOffsets( strStart: int, strEnd: int | None = None, raiseOnError: bool = False, - ) -> int | Tuple[int, int]: + ) -> int | tuple[int, int]: """ This method takes two offsets from the str representation of the string the object is initialized with, and converts them to wide character string offsets. @@ -177,7 +177,7 @@ def encodedToStrOffsets( encodedStart: int, encodedEnd: int, raiseOnError: bool = False, - ) -> Tuple[int, int]: + ) -> tuple[int, int]: r""" This method takes two offsets from the wide character representation of the string the object is initialized with, and converts them to str offsets. @@ -245,7 +245,7 @@ def encodedToStrOffsets( def getTextFromRawBytes( buf: bytes, numChars: int, - encoding: Optional[str] = None, + encoding: str | None = None, errorsFallback: str = "replace", ): """ @@ -347,7 +347,7 @@ def strToEncodedOffsets( strStart: int, strEnd: int | None = None, raiseOnError: bool = False, - ) -> int | Tuple[int, int]: + ) -> int | tuple[int, int]: super().strToEncodedOffsets(strStart, strEnd, raiseOnError) if strStart == 0: resultStart = 0 @@ -366,7 +366,7 @@ def encodedToStrOffsets( encodedStart: int, encodedEnd: int | None = None, raiseOnError: bool = False, - ) -> int | Tuple[int, int]: + ) -> int | tuple[int, int]: r""" This method takes two offsets from UTF-8 representation of the string the object is initialized with, and converts them to str offsets. 
@@ -407,7 +407,7 @@ def strToEncodedOffsets( strStart: int, strEnd: int | None = None, raiseOnError: bool = False, - ) -> int | Tuple[int, int]: + ) -> int | tuple[int, int]: super().strToEncodedOffsets(strStart, strEnd, raiseOnError) if strEnd is None: return strStart @@ -418,7 +418,7 @@ def encodedToStrOffsets( encodedStart: int, encodedEnd: int | None = None, raiseOnError: bool = False, - ) -> int | Tuple[int, int]: + ) -> int | tuple[int, int]: super().encodedToStrOffsets(encodedStart, encodedEnd, raiseOnError) if encodedEnd is None: return encodedStart @@ -486,7 +486,7 @@ def strToEncodedOffsets( strStart: int, strEnd: int | None = None, raiseOnError: bool = False, - ) -> int | Tuple[int]: + ) -> int | tuple[int]: super().strToEncodedOffsets(strStart, strEnd, raiseOnError) if strStart == 0: resultStart = 0 @@ -505,7 +505,7 @@ def encodedToStrOffsets( encodedStart: int, encodedEnd: int | None = None, raiseOnError: bool = False, - ) -> int | Tuple[int]: + ) -> int | tuple[int]: super().encodedToStrOffsets(encodedStart, encodedEnd, raiseOnError) if encodedStart == 0: resultStart = 0 @@ -530,7 +530,7 @@ def unicodeNormalize(text: str, normalizationForm: str = DEFAULT_UNICODE_NORMALI return unicodedata.normalize(normalizationForm, text) -ENCODINGS_TO_CONVERTERS: dict[str, Type[OffsetConverter]] = { +ENCODINGS_TO_CONVERTERS: dict[str, type[OffsetConverter]] = { WCHAR_ENCODING: WideStringOffsetConverter, UTF8_ENCODING: UTF8OffsetConverter, "utf_32_le": IdentityOffsetConverter, @@ -539,7 +539,7 @@ def unicodeNormalize(text: str, normalizationForm: str = DEFAULT_UNICODE_NORMALI } -def getOffsetConverter(encoding: str) -> Type[OffsetConverter]: +def getOffsetConverter(encoding: str) -> type[OffsetConverter]: try: return ENCODINGS_TO_CONVERTERS[encoding] except IndexError as e: diff --git a/source/textUtils/segFlag.py b/source/textUtils/segFlag.py index 72153c80e18..c15a26c471d 100644 --- a/source/textUtils/segFlag.py +++ b/source/textUtils/segFlag.py @@ -5,6 +5,7 @@ 
from enum import IntFlag + # shared bit masks (explicit powers of two) _AUTO: int = 1 << 0 _UNISCRIBE: int = 1 << 1 From 53158b6b53dc3338f0ea2d9283a972956995c8f8 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 7 Sep 2025 18:17:50 +0800 Subject: [PATCH 46/93] update changelog --- user_docs/en/changes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index 15b90af52ff..0e024cca85b 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -48,6 +48,7 @@ Add-ons will need to be re-tested and have their manifest updated. * Licensecheck has been updated to 2025.1 (#18728, @bramd) * X64 NVDAHelper libraries are now also build for the [ARM64EC architecture](https://learn.microsoft.com/en-us/windows/arm/arm64ec). On ARM64 machines with Windows 11, these ARM64EC libraries are loaded instead of their X64 equivalents. (#18570, @leonarddeR) +* Added [cppjieba](https://github.com/yanyiwu/cppjieba) as a git submodule for Chinese word segmentation. (#18548, @CrazySteve0605) #### API Breaking Changes @@ -194,7 +195,6 @@ Please refer to [the developer guide](https://download.nvaccess.org/documentatio * The `nvda_dmp` utility has been removed. (#18480, @codeofdusk) * `comInterfaces_sconscript` has been updated to make the generated files in `comInterfaces` work better with IDEs. (#17608, @gexgd0419) * NVDA now configures `wx.lib.agw.persist.PersistenceManager` on GUI initialisation. (#18601) -* Added [cppjieba](https://github.com/yanyiwu/cppjieba) as a git submodule for word segmentation. 
(#18548, @CrazySteve0605) #### Deprecations From 30120f8a78178555e83b83eba71da7693a383c5c Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 7 Sep 2025 18:38:36 +0800 Subject: [PATCH 47/93] update building script --- nvdaHelper/cppjieba/sconscript | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/nvdaHelper/cppjieba/sconscript b/nvdaHelper/cppjieba/sconscript index 739e357ab51..70b7afab992 100644 --- a/nvdaHelper/cppjieba/sconscript +++ b/nvdaHelper/cppjieba/sconscript @@ -41,15 +41,13 @@ if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDi env.Install( outDir.Dir("dicts"), [ - f - for f in env.Glob(f"{cppjiebaDictPath}/*") - if f.name in ( + env.File(os.path.join(cppjiebaDictPath, name)) + for name in ( "jieba.dict.utf8", "user.dict.utf8", "hmm_model.utf8", ) - and not f.name.endswith(".in") - ], + ] ) Return("cppjiebaLib") From 00796fef5d5af3ea3fd2411da0645e7e1e0f00e2 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 7 Sep 2025 18:39:59 +0800 Subject: [PATCH 48/93] revert installing script --- source/setup.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/source/setup.py b/source/setup.py index 829c628fb3a..3f8a8e36d90 100755 --- a/source/setup.py +++ b/source/setup.py @@ -265,12 +265,7 @@ def _genManifestTemplate(shouldHaveUIAccess: bool) -> tuple[int, int, bytes]: ("images", glob("images/*.ico")), ("fonts", glob("fonts/*.ttf")), ("louis/tables", glob("louis/tables/*")), - ( - "cppjieba/dicts", - glob("cppjieba/dicts/jieba.dict.utf8") - + glob("cppjieba/dicts/user.dict.utf8") - + glob("cppjieba/dicts/hmm_model.utf8"), - ), + ("cppjieba/dicts", glob("cppjieba/dicts/*")), ("COMRegistrationFixes", glob("COMRegistrationFixes/*.reg")), ("miscDeps/tools", ["../miscDeps/tools/msgfmt.exe"]), (".", glob("../miscDeps/python/*.dll")), From 09b18901707eddbd581dffcc1a4a6ea4cedf0e51 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 7 
Sep 2025 19:04:29 +0800 Subject: [PATCH 49/93] fix building script --- nvdaHelper/cppjieba/sconscript | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nvdaHelper/cppjieba/sconscript b/nvdaHelper/cppjieba/sconscript index 70b7afab992..a6d61dd39c5 100644 --- a/nvdaHelper/cppjieba/sconscript +++ b/nvdaHelper/cppjieba/sconscript @@ -41,7 +41,7 @@ if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDi env.Install( outDir.Dir("dicts"), [ - env.File(os.path.join(cppjiebaDictPath, name)) + env.Dir(cppjiebaDictPath).File(name) for name in ( "jieba.dict.utf8", "user.dict.utf8", From b3e08ee882b66f1fa9ea0b5dd32d5e87c70dbc16 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Wed, 10 Sep 2025 00:23:09 +0800 Subject: [PATCH 50/93] update helper of `cppjieba` --- nvdaHelper/cppjieba/cppjieba.cpp | 17 +++++++++-------- nvdaHelper/cppjieba/cppjieba.def | 2 +- nvdaHelper/cppjieba/cppjieba.hpp | 8 ++++---- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/nvdaHelper/cppjieba/cppjieba.cpp b/nvdaHelper/cppjieba/cppjieba.cpp index c45286d1a68..233e6d3ee0c 100644 --- a/nvdaHelper/cppjieba/cppjieba.cpp +++ b/nvdaHelper/cppjieba/cppjieba.cpp @@ -7,6 +7,7 @@ For full terms and any additional permissions, see the NVDA license file: https: #include "cppjieba.hpp" + JiebaSingleton& JiebaSingleton::getInstance() { // C++11 guarantees thread-safe init of this local static static JiebaSingleton instance; @@ -15,13 +16,13 @@ JiebaSingleton& JiebaSingleton::getInstance() { return instance; } JiebaSingleton::JiebaSingleton(): cppjieba::JiebaSegmenter() { } // call base ctor to load dictionaries, models, etc.
-void JiebaSingleton::getOffsets(const std::string& text, std::vector& wordEndOffsets) { +void JiebaSingleton::getWordEndOffsets(const std::string& text, std::vector& wordEndOffsets) { std::lock_guard lock(segMutex); std::vector words; this->Cut(text, words, true); for (auto const& word : words) { - wordEndOffsets.push_back(word.unicode_offset); + wordEndOffsets.push_back(word.unicode_offset + word.unicode_length); } } @@ -37,24 +38,24 @@ int initJieba() { } } -int segmentOffsets(const char* text, int** wordEndOffsets, int* outLen) { - if (!text || !wordEndOffsets || !outLen) return -1; +bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen) { + if (!text || !wordEndOffsets || !outLen) return false; // we assume initJieba() has already been called successfully - std::string input(text); + std::string textStr(text); std::vector offs; - JiebaSingleton::getInstance().getOffsets(input, offs); + JiebaSingleton::getInstance().getWordEndOffsets(textStr, offs); int n = static_cast(offs.size()); int* buf = static_cast(std::malloc(sizeof(int) * n)); if (!buf) { *outLen = 0; - return -1; + return false; } for (int i = 0; i < n; ++i) buf[i] = offs[i]; *wordEndOffsets = buf; *outLen = n; - return 0; + return true; } bool insertUserWord(const char* word, int freq, const char* tag = cppjieba::UNKNOWN_TAG) { diff --git a/nvdaHelper/cppjieba/cppjieba.def b/nvdaHelper/cppjieba/cppjieba.def index 565aef79db0..fca4a152027 100644 --- a/nvdaHelper/cppjieba/cppjieba.def +++ b/nvdaHelper/cppjieba/cppjieba.def @@ -1,7 +1,7 @@ LIBRARY cppjieba EXPORTS initJieba - segmentOffsets + calculateWordOffsets insertUserWord deleteUserWord find diff --git a/nvdaHelper/cppjieba/cppjieba.hpp b/nvdaHelper/cppjieba/cppjieba.hpp index 900d60b734c..2bb98cd70b5 100644 --- a/nvdaHelper/cppjieba/cppjieba.hpp +++ b/nvdaHelper/cppjieba/cppjieba.hpp @@ -125,8 +125,8 @@ class JiebaSingleton : public cppjieba::JiebaSegmenter { /// @brief Do thread-safe segmentation and compute word end 
offsets. /// @param text The input text in UTF-8 encoding. - /// @param wordEndOffsets Output vector to hold word offsets. - void getOffsets(const string& text, vector& wordEndOffsets); + /// @param wordEndOffsets Output vector to hold byte offsets of word ends. + void getWordEndOffsets(const string& text, vector& wordEndOffsets); private: JiebaSingleton(); ///< private ctor initializes base Jieba @@ -154,14 +154,14 @@ JIEBA_API int initJieba(); /// @brief Segment UTF-8 text into character offsets. /// @return 0 on success, -1 on failure. -JIEBA_API int segmentOffsets(const char* text, int** wordEndOffsets, int* outLen); +JIEBA_API bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen); /// Wrapper for word management JIEBA_API bool insertUserWord(const char* word, int freq, const char* tag); JIEBA_API bool deleteUserWord(const char* word, const char* tag); JIEBA_API bool find(const char* word); -/// @brief Free memory allocated by segmentOffsets. +/// @brief Free memory allocated by calculateWordOffsets. 
JIEBA_API void freeOffsets(int* ptr); } // extern "C" From 3a0badc72412e5526155e2e840a4ee6830bdf52d Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Wed, 10 Sep 2025 00:37:33 +0800 Subject: [PATCH 51/93] update `wordSegStrategy.py` * add LRU caching --- source/textUtils/wordSeg/wordSegStrategy.py | 83 +++++++++++++-------- 1 file changed, 53 insertions(+), 30 deletions(-) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index c81ade7b761..9fca4c9a1dc 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -148,9 +148,9 @@ def _ensureLibLoaded(cls): # TODO: make cppjieba alternative cls._lib.initJieba.restype = c_int cls._lib.initJieba.argtypes = [] - #int segmentOffsets(const char* text, int** wordEndOffsets, int* outLen); - cls._lib.segmentOffsets.restype = c_int - cls._lib.segmentOffsets.argtypes = [c_char_p, POINTER(POINTER(c_int)), POINTER(c_int)] + # bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen) + cls._lib.calculateWordOffsets.restype = c_bool + cls._lib.calculateWordOffsets.argtypes = [c_char_p, POINTER(POINTER(c_int)), POINTER(c_int)] # bool insertUserWord(const char* word, int freq, const char* tag) cls._lib.insertUserWord.restype = c_bool @@ -173,46 +173,69 @@ def _ensureLibLoaded(cls): # TODO: make cppjieba alternative log.debugWarning("Failed to load cppjieba library: %s", e) cls._lib = None - @staticmethod @lru_cache(maxsize=256) - def _callCppjiebaCached(text_utf8: bytes) -> list[int]: - """Module-level cached wrapper to call the C library given utf8 bytes.""" - if ChineseWordSegmentationStrategy._lib is None: - return [] - lib = ChineseWordSegmentationStrategy._lib + def _callCppjiebaCached(self, text_utf8: bytes) -> list[int] | None: + if self._lib is None: + return None + charPtr = POINTER(c_int)() outLen = c_int(0) + try: - res = lib.segmentOffsets(text_utf8, byref(charPtr), byref(outLen)) - 
if res != 0 or not bool(charPtr): - return [] - n = outLen.value - # read n ints - offsets = [charPtr[i] for i in range(n)] - # free memory allocated by C side - lib.freeOffsets(charPtr) - return offsets + success: bool = self._lib.calculateWordOffsets(text_utf8, byref(charPtr), byref(outLen)) + if not success or not bool(charPtr) or outLen.value <= 0: + return None + + try: + n = outLen.value + offsets = [charPtr[i] for i in range(n)] + return offsets + finally: + self._lib.freeOffsets(charPtr) except Exception as e: log.debugWarning("Exception calling cppjieba: %s", e) try: if bool(charPtr): - lib.freeOffsets(charPtr) + self._lib.freeOffsets(charPtr) except Exception: pass - return [] + return None + - @lru_cache(maxsize=128) def _callCPPJieba(self) -> list[int] | None: + """ + Instance method: encode self.text and call cppjieba. + Returns list[int] on success, None on failure. + Uses LRU cache keyed by utf-8 bytes. + """ data = self.text.encode("utf-8") - charPtr = POINTER(c_int)() - outLen = c_int() - result = self._lib.segmentOffsets(data, byref(charPtr), byref(outLen)) - if result != 0 or not charPtr: - return - n = outLen.value - charOffsets = [charPtr[i] for i in range(n)] - self._lib.freeOffsets(charPtr) - return charOffsets + + if getattr(self, "_lib", None) is ChineseWordSegmentationStrategy._lib: + return self._callCppjiebaCached(data) + else: + if self._lib is None: + return None + + charPtr = POINTER(c_int)() + outLen = c_int(0) + try: + success: bool = self._lib.calculateWordOffsets(data, byref(charPtr), byref(outLen)) + if not success or not bool(charPtr) or outLen.value <= 0: + return None + + try: + n = outLen.value + return [charPtr[i] for i in range(n)] + finally: + self._lib.freeOffsets(charPtr) + except Exception as e: + log.debugWarning("Exception calling cppjieba: %s", e) + try: + if bool(charPtr): + self._lib.freeOffsets(charPtr) + except Exception: + pass + return None def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = 
None) -> str: """Segments the text using the word end indices.""" From cf3e11501d325c373f8cdd3823c8611f986cbacc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 9 Sep 2025 16:49:47 +0000 Subject: [PATCH 52/93] Pre-commit auto-fix --- source/textUtils/wordSeg/wordSegStrategy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index 9fca4c9a1dc..a1188e16b46 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -201,7 +201,6 @@ def _callCppjiebaCached(self, text_utf8: bytes) -> list[int] | None: pass return None - def _callCPPJieba(self) -> list[int] | None: """ Instance method: encode self.text and call cppjieba. From 97eb6dd43a846ea066f07d71e298d62d08763982 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sat, 13 Sep 2025 09:19:40 +0800 Subject: [PATCH 53/93] handle punctuation spacing --- source/textUtils/wordSeg/wordSegStrategy.py | 61 +++++++++++++++++---- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index a1188e16b46..867d1ea8d96 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -236,31 +236,70 @@ def _callCPPJieba(self) -> list[int] | None: pass return None + # Punctuation that should NOT have a separator BEFORE it (no space before these marks) + NO_SEP_BEFORE = { + # Common Chinese fullwidth punctuation + "。", ",", "、", ";", ":", "?", "!", "…", "...", "—", "–", "——", + ")", "】", "》", "〉", "」", "』", "”", "’", + "%", "‰", "¥", + + # Common ASCII / halfwidth punctuation + ".", ",", ";", ":", "?", "!", "%", ".", ")", + "]", "}", ">", "\"", "'" + } + + # Punctuation that should NOT have a separator AFTER it (no space after these marks) + NO_SEP_AFTER = { + # Common 
Chinese fullwidth opening/leading punctuation + "(", "【", "《", "〈", "「", "『", "“", "‘", + + # Common ASCII / halfwidth opening/leading punctuation + "(", "[", "{", "<", "\"", "'", + + # Currency and prefix-like symbols that typically bind to the following token + "$", "€", "£", "¥", "₹", + + # Social/identifier prefixes + "@", "#", "&" + } + def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: """Segments the text using the word end indices.""" if len(self.wordEndIndex) <= 1: return self.text + result = "" for sepIndex in range(len(self.wordEndIndex) - 1): - if sepIndex == 0: - preIndex = 0 - else: - preIndex = self.wordEndIndex[sepIndex - 1] + preIndex = 0 if sepIndex == 0 else self.wordEndIndex[sepIndex - 1] curIndex = self.wordEndIndex[sepIndex] postIndex = self.wordEndIndex[sepIndex + 1] + + # append the token before the potential separator position result += self.text[preIndex:curIndex] - if ( - (sepIndex < len(self.wordEndIndex) - 1) - and not (result.endswith(sep)) - and not (self.text[curIndex:postIndex].startswith(sep)) - ): - """The separator needs adding.""" + + + # quick checks: avoid adding duplicate separator if already present + if result.endswith(sep) or self.text[curIndex:postIndex].startswith(sep): + # separator already present at either side -> skip adding + continue + + # slice to check the next token (text between curIndex and postIndex) + nextSlice = self.text[curIndex:postIndex] + + # Determine whether any punctuation forbids a separator BEFORE the next token + noSepBefore = any(nextSlice.startswith(s) for s in self.NO_SEP_BEFORE) + # Determine whether any punctuation forbids a separator AFTER the current result + noSepAfter = any(result.endswith(s) for s in self.NO_SEP_AFTER) + + if not (noSepBefore or noSepAfter): + # If neither side forbids the separator, add it result += sep if newSepIndex is not None: - """Track the index of the separators.""" newSepIndex.append(len(result) - len(sep)) else: + # append 
the final trailing token after the loop result += self.text[curIndex:postIndex] + return result def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: From bac32106a93ce6a6e9cd50f8c77230359ee1e41b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 13 Sep 2025 01:33:24 +0000 Subject: [PATCH 54/93] Pre-commit auto-fix --- source/textUtils/wordSeg/wordSegStrategy.py | 73 +++++++++++++++++---- 1 file changed, 59 insertions(+), 14 deletions(-) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index 867d1ea8d96..a69b5bcc82e 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -239,28 +239,74 @@ def _callCPPJieba(self) -> list[int] | None: # Punctuation that should NOT have a separator BEFORE it (no space before these marks) NO_SEP_BEFORE = { # Common Chinese fullwidth punctuation - "。", ",", "、", ";", ":", "?", "!", "…", "...", "—", "–", "——", - ")", "】", "》", "〉", "」", "』", "”", "’", - "%", "‰", "¥", - + "。", + ",", + "、", + ";", + ":", + "?", + "!", + "…", + "...", + "—", + "–", + "——", + ")", + "】", + "》", + "〉", + "」", + "』", + "”", + "’", + "%", + "‰", + "¥", # Common ASCII / halfwidth punctuation - ".", ",", ";", ":", "?", "!", "%", ".", ")", - "]", "}", ">", "\"", "'" + ".", + ",", + ";", + ":", + "?", + "!", + "%", + ".", + ")", + "]", + "}", + ">", + '"', + "'", } # Punctuation that should NOT have a separator AFTER it (no space after these marks) NO_SEP_AFTER = { # Common Chinese fullwidth opening/leading punctuation - "(", "【", "《", "〈", "「", "『", "“", "‘", - + "(", + "【", + "《", + "〈", + "「", + "『", + "“", + "‘", # Common ASCII / halfwidth opening/leading punctuation - "(", "[", "{", "<", "\"", "'", - + "(", + "[", + "{", + "<", + '"', + "'", # Currency and prefix-like symbols that typically bind to the following token - "$", "€", "£", "¥", "₹", - + "$", + "€", + "£", + 
"¥", + "₹", # Social/identifier prefixes - "@", "#", "&" + "@", + "#", + "&", } def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: @@ -277,7 +323,6 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> # append the token before the potential separator position result += self.text[preIndex:curIndex] - # quick checks: avoid adding duplicate separator if already present if result.endswith(sep) or self.text[curIndex:postIndex].startswith(sep): # separator already present at either side -> skip adding From c2cbb2409c7a82b2578b195ce35d7dc603d11263 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sat, 20 Sep 2025 23:53:56 +0800 Subject: [PATCH 55/93] Revert "Update projectDocs/dev/createDevEnvironment.md" This reverts commit 06070c12f7261ddac8dbd223e6d561950cad500d. --- projectDocs/dev/createDevEnvironment.md | 1 + 1 file changed, 1 insertion(+) diff --git a/projectDocs/dev/createDevEnvironment.md b/projectDocs/dev/createDevEnvironment.md index 4981fe6db77..1e15e98df69 100644 --- a/projectDocs/dev/createDevEnvironment.md +++ b/projectDocs/dev/createDevEnvironment.md @@ -100,6 +100,7 @@ If you aren't sure, run `git submodule update` after every git pull, merge or ch * [Java Access Bridge 32 bit, from Zulu Community OpenJDK build 17.0.9+8Zulu (17.46.19)](https://github.com/nvaccess/javaAccessBridge32-bin) * We are in the process of switching to: Java Access Bridge 64 bit, from Zulu Community OpenJDK build 17.0.16+8Zulu (17.60.17) * [Windows Implementation Libraries (WIL)](https://github.com/microsoft/wil/) +* [NVDA DiffMatchPatch](https://github.com/codeofdusk/nvda_dmp) * [cppjieba - Chinese word segmentation](https://github.com/yanyiwu/cppjieba), commit `9408c1d08facc6e324dc90260e8cb20ecceebf70` #### Build time dependencies From 194a69ead2fee07fb53ea56fc556c0733dc25b67 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 21 Sep 2025 00:10:40 +0800 Subject: [PATCH 56/93] 
avoid using compilation time path --- nvdaHelper/cppjieba/cppjieba.cpp | 91 ++++++++++++++++++++++++-------- nvdaHelper/cppjieba/cppjieba.hpp | 50 ++++++++---------- 2 files changed, 92 insertions(+), 49 deletions(-) diff --git a/nvdaHelper/cppjieba/cppjieba.cpp b/nvdaHelper/cppjieba/cppjieba.cpp index 233e6d3ee0c..d63aa1f130c 100644 --- a/nvdaHelper/cppjieba/cppjieba.cpp +++ b/nvdaHelper/cppjieba/cppjieba.cpp @@ -8,54 +8,101 @@ For full terms and any additional permissions, see the NVDA license file: https: #include "cppjieba.hpp" +using namespace std; + +// static members for singleton bookkeeping +JiebaSingleton* JiebaSingleton::instance = nullptr; +std::once_flag JiebaSingleton::initFlag; + +JiebaSingleton& JiebaSingleton::getInstance(const char* dictDir) { + // convert incoming C-string+length to std::string (handles dictDir == nullptr) + std::string dir = dictDir; + + // ensure singleton is constructed exactly once + std::call_once(JiebaSingleton::initFlag, [&]() { + // allocate on heap, so we avoid copy/move and control lifetime + JiebaSingleton::instance = new JiebaSingleton(dir.c_str()); + // optional: register deleter at exit + std::atexit([]() { + delete JiebaSingleton::instance; + JiebaSingleton::instance = nullptr; + }); + }); + + // after call_once, instance must be non-null + return *JiebaSingleton::instance; +} + JiebaSingleton& JiebaSingleton::getInstance() { - // C++11 guarantees thread-safe init of this local static - static JiebaSingleton instance; - return instance; + if (!JiebaSingleton::instance) { + throw std::runtime_error("JiebaSingleton::getInstance() called before initialization. Call getInstance(dictDir) or initJieba() first."); + } + return *JiebaSingleton::instance; } -JiebaSingleton::JiebaSingleton(): cppjieba::JiebaSegmenter() { } // call base ctor to load dictionaries, models, etc. 
+JiebaSingleton::JiebaSingleton(const char* dictDir) +: cppjieba::JiebaSegmenter( + std::string(dictDir), + std::string(dictDir), + std::string(dictDir) + ) +{ + // base class ctor will load dictionaries/models +} void JiebaSingleton::getWordEndOffsets(const std::string& text, std::vector& wordEndOffsets) { std::lock_guard lock(segMutex); + wordEndOffsets.clear(); std::vector words; this->Cut(text, words, true); - for (auto const& word : words) { + for (const auto& word : words) { wordEndOffsets.push_back(word.unicode_offset + word.unicode_length); } } extern "C" { -int initJieba() { +bool initJieba(const char* dictDir) { try { // simply force the singleton into existence - (void)JiebaSingleton::getInstance(); - return 0; + (void)JiebaSingleton::getInstance(dictDir); + return true; } catch (...) { - return -1; + return false; } } bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen) { if (!text || !wordEndOffsets || !outLen) return false; - // we assume initJieba() has already been called successfully - - std::string textStr(text); - std::vector offs; - JiebaSingleton::getInstance().getWordEndOffsets(textStr, offs); - int n = static_cast(offs.size()); - int* buf = static_cast(std::malloc(sizeof(int) * n)); - if (!buf) { + try { + std::string textStr(text); + std::vector offs; + JiebaSingleton::getInstance().getWordEndOffsets(textStr, offs); + + int n = static_cast(offs.size()); + if (n == 0) { + *wordEndOffsets = nullptr; + *outLen = 0; + return true; // success, but no offsets + } + + int* buf = static_cast(std::malloc(sizeof(int) * n)); + if (!buf) { + *wordEndOffsets = nullptr; + *outLen = 0; + return false; + } + for (int i = 0; i < n; ++i) buf[i] = offs[i]; + *wordEndOffsets = buf; + *outLen = n; + return true; + } catch (...) 
{ + *wordEndOffsets = nullptr; *outLen = 0; return false; } - for (int i = 0; i < n; ++i) buf[i] = offs[i]; - *wordEndOffsets = buf; - *outLen = n; - return true; } bool insertUserWord(const char* word, int freq, const char* tag = cppjieba::UNKNOWN_TAG) { @@ -71,7 +118,7 @@ bool find(const char* word) { } void freeOffsets(int* ptr) { - if (ptr) free(ptr); + if (ptr) std::free(ptr); } } // extern "C" diff --git a/nvdaHelper/cppjieba/cppjieba.hpp b/nvdaHelper/cppjieba/cppjieba.hpp index 2bb98cd70b5..13ccf47acc6 100644 --- a/nvdaHelper/cppjieba/cppjieba.hpp +++ b/nvdaHelper/cppjieba/cppjieba.hpp @@ -14,6 +14,8 @@ For full terms and any additional permissions, see the NVDA license file: https: #include #include #include +#include +#include #include "QuerySegment.hpp" using namespace std; @@ -22,11 +24,11 @@ namespace cppjieba { // copied from Jieba.hpp and modified to drop off its keyw class JiebaSegmenter { public: - JiebaSegmenter(const string& dict_path = "", - const string& model_path = "", - const string& user_dict_path = "") - : dict_trie_(getPath(dict_path, "jieba.dict.utf8"), getPath(user_dict_path, "user.dict.utf8")), - model_(getPath(model_path, "hmm_model.utf8")), + JiebaSegmenter(const string& dict_path, + const string& model_path, + const string& user_dict_path) + : dict_trie_(pathJoin(dict_path, "jieba.dict.utf8"), pathJoin(user_dict_path, "user.dict.utf8")), + model_(pathJoin(model_path, "hmm_model.utf8")), mix_seg_(&dict_trie_, &model_) { } ~JiebaSegmenter() { @@ -97,17 +99,6 @@ class JiebaSegmenter { return (pos == string::npos) ? 
"" : path.substr(0, pos); } - static string getPath(const string& path, const string& default_file) { - if (path.empty()) { - string current_dir = getCurrentDirectory(); - string parent_dir = current_dir.substr(0, current_dir.find_last_of("/\\")); - string grandparent_dir = parent_dir.substr(0, parent_dir.find_last_of("/\\")); - string root_dir = grandparent_dir.substr(0, grandparent_dir.find_last_of("/\\")); - return pathJoin(pathJoin(pathJoin(root_dir, "include\\cppjieba"), "dict"), default_file); - } - return path; - } - DictTrie dict_trie_; HMMModel model_; @@ -121,21 +112,27 @@ class JiebaSegmenter { class JiebaSingleton : public cppjieba::JiebaSegmenter { public: /// @brief Returns the single instance, constructing on first call. + static JiebaSingleton& getInstance(const char* dictDir); + static JiebaSingleton& getInstance(); /// @brief Do thread-safe segmentation and compute word end offsets. - /// @param text The input text in UTF-8 encoding. - /// @param wordEndOffsets Output vector to hold byte offsets of word ends. - void getWordEndOffsets(const string& text, vector& wordEndOffsets); + /// @param text The input text in UTF-8 encoding. + /// @param wordEndOffsets Output vector to hold byte offsets of word ends. 
+ void getWordEndOffsets(const std::string& text, std::vector& wordEndOffsets); + + // singleton bookkeeping + static JiebaSingleton* instance; + static std::once_flag initFlag; private: - JiebaSingleton(); ///< private ctor initializes base Jieba + JiebaSingleton(const char* dictDir); ///< private ctor initializes base Jieba - /// Disable copy and move - JiebaSingleton(const JiebaSingleton&) = delete; - JiebaSingleton& operator = (const JiebaSingleton&) = delete; - JiebaSingleton(JiebaSingleton&&) = delete; - JiebaSingleton& operator = (JiebaSingleton&&) = delete; + /// Disable copy and move + JiebaSingleton(const JiebaSingleton&) = delete; + JiebaSingleton& operator = (const JiebaSingleton&) = delete; + JiebaSingleton(JiebaSingleton&&) = delete; + JiebaSingleton& operator = (JiebaSingleton&&) = delete; std::mutex segMutex; ///< guards concurrent Cut() calls }; @@ -149,8 +146,7 @@ class JiebaSingleton : public cppjieba::JiebaSegmenter { extern "C" { /// @brief Force singleton construction (load dicts, etc.) before any segmentation. -/// @return 0 on success, -1 on failure. -JIEBA_API int initJieba(); +JIEBA_API bool initJieba(const char* dictDir); /// @brief Segment UTF-8 text into character offsets. /// @return 0 on success, -1 on failure. 
From 2e730d6a1c1aa7945efddeff37358ebf9536f9f2 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 21 Sep 2025 00:20:19 +0800 Subject: [PATCH 57/93] Update .gitattributes Co-authored-by: Sean Budd --- .gitattributes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 27936ed7319..5e783cc057e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -50,7 +50,7 @@ sconstruct text diff=python *.c text diff=c *.cpp text diff=cpp *.h text diff=c -*.hpp text diff=cpp +*.hpp text diff=cpp *.idl text diff=c *.acf text diff=c From a8955a3b6dae94eba41139fa5836acae81ae8a78 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 21 Sep 2025 22:19:19 +0800 Subject: [PATCH 58/93] Revert "update module importing order and type annotations" This reverts commit 3bfbe59d4c310b75010362f5f4f2936bfadf0d47. --- source/NVDAObjects/window/edit.py | 12 ++++++++--- source/displayModel.py | 29 ++++++++++++--------------- source/gui/settingsDialogs.py | 33 +++++++++++++++++-------------- source/textInfos/offsets.py | 19 +++++++++++------- source/textUtils/__init__.py | 28 +++++++++++++------------- source/textUtils/segFlag.py | 1 - 6 files changed, 66 insertions(+), 56 deletions(-) diff --git a/source/NVDAObjects/window/edit.py b/source/NVDAObjects/window/edit.py index 9004d97a774..1961ccdd258 100644 --- a/source/NVDAObjects/window/edit.py +++ b/source/NVDAObjects/window/edit.py @@ -3,6 +3,12 @@ # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt +from typing import ( + Dict, + Optional, + Union, +) + import ctypes from comtypes import BSTR, COMError import colors @@ -375,7 +381,7 @@ def _getFormatFieldAndOffsets(self, offset, formatConfig, calculateOffsets=True) def _setFormatFieldColor( self, - charFormat: CharFormat2AStruct | CharFormat2WStruct, + charFormat: Union[CharFormat2AStruct, CharFormat2WStruct], formatField: textInfos.FormatField, ) -> None: if charFormat.dwEffects & CFE_AUTOCOLOR: @@ -656,7 +662,7 @@ def _getParagraphOffsets(self, offset): comInterfaces.tom.tomStory: textInfos.UNIT_STORY, } -NVDAUnitsToITextDocumentUnits: dict[str, int] = { +NVDAUnitsToITextDocumentUnits: Dict[str, int] = { textInfos.UNIT_CHARACTER: comInterfaces.tom.tomCharacter, textInfos.UNIT_WORD: comInterfaces.tom.tomWord, textInfos.UNIT_LINE: comInterfaces.tom.tomLine, @@ -909,7 +915,7 @@ def __init__(self, obj, position, _rangeObj=None): else: raise NotImplementedError("position: %s" % position) - def getTextWithFields(self, formatConfig: dict | None = None) -> textInfos.TextInfo.TextWithFieldsT: + def getTextWithFields(self, formatConfig: Optional[Dict] = None) -> textInfos.TextInfo.TextWithFieldsT: if not formatConfig: formatConfig = config.conf["documentFormatting"] textRange = self._rangeObj.duplicate diff --git a/source/displayModel.py b/source/displayModel.py index e13e5c432e0..941f11a6234 100644 --- a/source/displayModel.py +++ b/source/displayModel.py @@ -3,14 +3,9 @@ # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt -from ctypes import ( - byref, - c_short, - c_long, -) +from ctypes import byref, c_short, c_long import unicodedata import math - from NVDAHelper import localLib import colors import XMLFormatting @@ -27,12 +22,14 @@ import windowUtils from locationHelper import RectLTRB, RectLTWH import textUtils -from textUtils.segFlag import ( - CharSegFlag, - WordSegFlag, +from textUtils.segFlag import CharSegFlag, WordSegFlag +from typing import ( + List, + Tuple, + Optional, + Dict, ) - #: A text info unit constant for a single chunk in a display model UNIT_DISPLAYCHUNK = "displayChunk" @@ -388,11 +385,11 @@ def __init__(self, obj, position, limitRect=None): def _get__storyFieldsAndRects( self, - ) -> tuple[ - list[textInfos.TextInfo.TextOrFieldsT], - list[RectLTRB], - list[int], - list[int], + ) -> Tuple[ + List[textInfos.TextInfo.TextOrFieldsT], + List[RectLTRB], + List[int], + List[int], ]: # All returned coordinates are logical coordinates. 
if self._location: @@ -539,7 +536,7 @@ def wordSegFlag(self): def _getTextRange(self, start, end): return "".join(x for x in self._getFieldsInRange(start, end) if isinstance(x, str)) - def getTextWithFields(self, formatConfig: dict | None = None) -> textInfos.TextInfo.TextWithFieldsT: + def getTextWithFields(self, formatConfig: Optional[Dict] = None) -> textInfos.TextInfo.TextWithFieldsT: start = self._startOffset end = self._endOffset if start == end: diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py index 4dd59ce06ab..d4e83f06e78 100644 --- a/source/gui/settingsDialogs.py +++ b/source/gui/settingsDialogs.py @@ -69,6 +69,9 @@ from typing import ( Any, Callable, + List, + Optional, + Set, ) import core import keyboardHandler @@ -194,7 +197,7 @@ def __init__( hasApplyButton: bool = False, settingsSizerOrientation: int = wx.VERTICAL, multiInstanceAllowed: bool = False, - buttons: set[int] = {wx.OK, wx.CANCEL}, + buttons: Set[int] = {wx.OK, wx.CANCEL}, ): """ @param parent: The parent for this dialog; C{None} for no parent. 
@@ -433,7 +436,7 @@ def _validationErrorMessageBox( self, message: str, option: str, - category: str | None = None, + category: Optional[str] = None, ): if category is None: category = self.title @@ -502,7 +505,7 @@ class MultiCategorySettingsDialog(SettingsDialog): """ title = "" - categoryClasses: list[type[SettingsPanel]] = [] + categoryClasses: typing.List[typing.Type[SettingsPanel]] = [] class CategoryUnavailableError(RuntimeError): pass @@ -1265,7 +1268,7 @@ class SynthesizerSelectionDialog(SettingsDialog): # Translators: This is the label for the synthesizer selection dialog title = _("Select Synthesizer") helpId = "SynthesizerSelection" - synthNames: list[str] = [] + synthNames: List[str] = [] def makeSettings(self, settingsSizer): settingsSizerHelper = guiHelper.BoxSizerHelper(self, sizer=settingsSizer) @@ -5143,7 +5146,7 @@ def _onModeChange(self, evt: wx.CommandEvent): def showStartErrorForProviders( parent: wx.Window, - providers: list[vision.providerInfo.ProviderInfo], + providers: List[vision.providerInfo.ProviderInfo], ) -> None: if not providers: return @@ -5173,7 +5176,7 @@ def showStartErrorForProviders( def showTerminationErrorForProviders( parent: wx.Window, - providers: list[vision.providerInfo.ProviderInfo], + providers: List[vision.providerInfo.ProviderInfo], ) -> None: if not providers: return @@ -5219,7 +5222,7 @@ def __init__( def getProviderInfo(self) -> vision.providerInfo.ProviderInfo: return self._providerInfo - def getProviderInstance(self) -> vision.providerBase.VisionEnhancementProvider | None: + def getProviderInstance(self) -> Optional[vision.providerBase.VisionEnhancementProvider]: return vision.handler.getProviderInstance(self._providerInfo) def startProvider( @@ -5283,8 +5286,8 @@ def _doTerminate(self) -> bool: class VisionSettingsPanel(SettingsPanel): settingsSizerHelper: guiHelper.BoxSizerHelper - providerPanelInstances: list[SettingsPanel] - initialProviders: list[vision.providerInfo.ProviderInfo] + providerPanelInstances: 
List[SettingsPanel] + initialProviders: List[vision.providerInfo.ProviderInfo] # Translators: This is the label for the vision panel title = _("Vision") helpId = "VisionSettings" @@ -5295,7 +5298,7 @@ class VisionSettingsPanel(SettingsPanel): def _createProviderSettingsPanel( self, providerInfo: vision.providerInfo.ProviderInfo, - ) -> SettingsPanel | None: + ) -> Optional[SettingsPanel]: settingsPanelCls = providerInfo.providerClass.getSettingsPanelClass() if not settingsPanelCls: if gui._isDebug(): @@ -5340,12 +5343,12 @@ def makeSettings(self, settingsSizer: wx.BoxSizer): def safeInitProviders( self, - providers: list[vision.providerInfo.ProviderInfo], + providers: List[vision.providerInfo.ProviderInfo], ) -> None: """Initializes one or more providers in a way that is gui friendly, showing an error if appropriate. """ - errorProviders: list[vision.providerInfo.ProviderInfo] = [] + errorProviders: List[vision.providerInfo.ProviderInfo] = [] for provider in providers: success = VisionProviderStateControl(self, provider).startProvider(shouldPromptOnError=False) if not success: @@ -5354,14 +5357,14 @@ def safeInitProviders( def safeTerminateProviders( self, - providers: list[vision.providerInfo.ProviderInfo], + providers: List[vision.providerInfo.ProviderInfo], verbose: bool = False, ) -> None: """Terminates one or more providers in a way that is gui friendly, @verbose: Whether to show a termination error. @returns: Whether termination succeeded for all providers. 
""" - errorProviders: list[vision.providerInfo.ProviderInfo] = [] + errorProviders: List[vision.providerInfo.ProviderInfo] = [] for provider in providers: success = VisionProviderStateControl(self, provider).terminateProvider(shouldPromptOnError=False) if not success: @@ -5454,7 +5457,7 @@ def __init__( providerControl: VisionProviderStateControl, ): self._providerControl = providerControl - self._providerSettings: VisionProviderSubPanel_Settings | None = None + self._providerSettings: Optional[VisionProviderSubPanel_Settings] = None self._providerSettingsSizer = wx.BoxSizer(orient=wx.VERTICAL) super().__init__(parent=parent) diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index 37c0a9cab34..f663dd2daf6 100755 --- a/source/textInfos/offsets.py +++ b/source/textInfos/offsets.py @@ -7,9 +7,6 @@ import re import ctypes import unicodedata -from dataclasses import dataclass -from typing import Self - import NVDAHelper import config.featureFlagEnums import textInfos @@ -17,6 +14,14 @@ from treeInterceptorHandler import TreeInterceptor import textUtils from textUtils.segFlag import CharSegFlag, WordSegFlag +from dataclasses import dataclass +from typing import ( + Optional, + Tuple, + Dict, + List, + Self, +) from logHandler import log @@ -169,7 +174,7 @@ def wordSegFlag(self) -> WordSegFlag | None: log.error(f"Unknown word segmentation standard, {self.__wordSegConf.calculated()!r}") #: The encoding internal to the underlying text info implementation. - encoding: str | None = textUtils.WCHAR_ENCODING + encoding: Optional[str] = textUtils.WCHAR_ENCODING def __eq__(self, other): if self is other or ( @@ -204,7 +209,7 @@ def _get_locationText(self): # C901 '_get_boundingRects' is too complex # Note: when working on _get_boundingRects, look for opportunities to simplify # and move logic out into smaller helper functions. 
- def _get_boundingRects(self) -> list[locationHelper.RectLTWH]: # noqa: C901 + def _get_boundingRects(self) -> List[locationHelper.RectLTWH]: # noqa: C901 if self.isCollapsed: return [] startOffset = self._startOffset @@ -337,7 +342,7 @@ def _calculateUniscribeOffsets( lineText: str, unit: str, relOffset: int, - ) -> tuple[int, int] | None: + ) -> Optional[Tuple[int, int]]: """ Calculates the bounds of a unit at an offset within a given string of text using the Windows uniscribe library, also used in Notepad, for example. @@ -618,7 +623,7 @@ def setEndPoint(self, other, which): else: self._startOffset = self._endOffset - def getTextWithFields(self, formatConfig: dict | None = None) -> textInfos.TextInfo.TextWithFieldsT: + def getTextWithFields(self, formatConfig: Optional[Dict] = None) -> textInfos.TextInfo.TextWithFieldsT: if not formatConfig: formatConfig = config.conf["documentFormatting"] if self.detectFormattingAfterCursorMaybeSlow and not formatConfig["detectFormatAfterCursor"]: diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 45c4e0c2dfc..43aa36a5973 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -15,7 +15,7 @@ from abc import ABCMeta, abstractmethod, abstractproperty from functools import cached_property -from typing import Generator +from typing import Generator, Optional, Tuple, Type from logHandler import log @@ -55,7 +55,7 @@ def strToEncodedOffsets( strStart: int, strEnd: int | None = None, raiseOnError: bool = False, - ) -> int | tuple[int, int]: + ) -> int | Tuple[int, int]: """ This method takes two offsets from the str representation of the string the object is initialized with, and converts them to subclass-specific encoded string offsets. 
@@ -84,7 +84,7 @@ def encodedToStrOffsets( encodedStart: int, encodedEnd: int | None = None, raiseOnError: bool = False, - ) -> int | tuple[int, int]: + ) -> int | Tuple[int, int]: r""" This method takes two offsets from subclass-specific encoded string representation of the string the object is initialized with, and converts them to str offsets. @@ -140,7 +140,7 @@ def strToEncodedOffsets( strStart: int, strEnd: int | None = None, raiseOnError: bool = False, - ) -> int | tuple[int, int]: + ) -> int | Tuple[int, int]: """ This method takes two offsets from the str representation of the string the object is initialized with, and converts them to wide character string offsets. @@ -177,7 +177,7 @@ def encodedToStrOffsets( encodedStart: int, encodedEnd: int, raiseOnError: bool = False, - ) -> tuple[int, int]: + ) -> Tuple[int, int]: r""" This method takes two offsets from the wide character representation of the string the object is initialized with, and converts them to str offsets. @@ -245,7 +245,7 @@ def encodedToStrOffsets( def getTextFromRawBytes( buf: bytes, numChars: int, - encoding: str | None = None, + encoding: Optional[str] = None, errorsFallback: str = "replace", ): """ @@ -347,7 +347,7 @@ def strToEncodedOffsets( strStart: int, strEnd: int | None = None, raiseOnError: bool = False, - ) -> int | tuple[int, int]: + ) -> int | Tuple[int, int]: super().strToEncodedOffsets(strStart, strEnd, raiseOnError) if strStart == 0: resultStart = 0 @@ -366,7 +366,7 @@ def encodedToStrOffsets( encodedStart: int, encodedEnd: int | None = None, raiseOnError: bool = False, - ) -> int | tuple[int, int]: + ) -> int | Tuple[int, int]: r""" This method takes two offsets from UTF-8 representation of the string the object is initialized with, and converts them to str offsets. 
@@ -407,7 +407,7 @@ def strToEncodedOffsets( strStart: int, strEnd: int | None = None, raiseOnError: bool = False, - ) -> int | tuple[int, int]: + ) -> int | Tuple[int, int]: super().strToEncodedOffsets(strStart, strEnd, raiseOnError) if strEnd is None: return strStart @@ -418,7 +418,7 @@ def encodedToStrOffsets( encodedStart: int, encodedEnd: int | None = None, raiseOnError: bool = False, - ) -> int | tuple[int, int]: + ) -> int | Tuple[int, int]: super().encodedToStrOffsets(encodedStart, encodedEnd, raiseOnError) if encodedEnd is None: return encodedStart @@ -486,7 +486,7 @@ def strToEncodedOffsets( strStart: int, strEnd: int | None = None, raiseOnError: bool = False, - ) -> int | tuple[int]: + ) -> int | Tuple[int]: super().strToEncodedOffsets(strStart, strEnd, raiseOnError) if strStart == 0: resultStart = 0 @@ -505,7 +505,7 @@ def encodedToStrOffsets( encodedStart: int, encodedEnd: int | None = None, raiseOnError: bool = False, - ) -> int | tuple[int]: + ) -> int | Tuple[int]: super().encodedToStrOffsets(encodedStart, encodedEnd, raiseOnError) if encodedStart == 0: resultStart = 0 @@ -530,7 +530,7 @@ def unicodeNormalize(text: str, normalizationForm: str = DEFAULT_UNICODE_NORMALI return unicodedata.normalize(normalizationForm, text) -ENCODINGS_TO_CONVERTERS: dict[str, type[OffsetConverter]] = { +ENCODINGS_TO_CONVERTERS: dict[str, Type[OffsetConverter]] = { WCHAR_ENCODING: WideStringOffsetConverter, UTF8_ENCODING: UTF8OffsetConverter, "utf_32_le": IdentityOffsetConverter, @@ -539,7 +539,7 @@ def unicodeNormalize(text: str, normalizationForm: str = DEFAULT_UNICODE_NORMALI } -def getOffsetConverter(encoding: str) -> type[OffsetConverter]: +def getOffsetConverter(encoding: str) -> Type[OffsetConverter]: try: return ENCODINGS_TO_CONVERTERS[encoding] except IndexError as e: diff --git a/source/textUtils/segFlag.py b/source/textUtils/segFlag.py index c15a26c471d..72153c80e18 100644 --- a/source/textUtils/segFlag.py +++ b/source/textUtils/segFlag.py @@ -5,7 +5,6 @@ 
from enum import IntFlag - # shared bit masks (explicit powers of two) _AUTO: int = 1 << 0 _UNISCRIBE: int = 1 << 1 From 90660ba364bf8da28a32cb6d46bbb5c198e154d5 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 21 Sep 2025 22:24:18 +0800 Subject: [PATCH 59/93] update `wordSegStrategy.py` --- source/textUtils/wordSeg/wordSegStrategy.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index a69b5bcc82e..f52dce8e7bf 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -9,6 +9,7 @@ c_bool, c_char_p, c_int, + create_string_buffer, POINTER, byref, ) @@ -144,9 +145,9 @@ def _ensureLibLoaded(cls): # TODO: make cppjieba alternative cls._lib = ctypes.cdll.LoadLibrary(lib_path) # Setup function signatures - # int initJieba() - cls._lib.initJieba.restype = c_int - cls._lib.initJieba.argtypes = [] + # bool initJieba(const char* dictDir) + cls._lib.initJieba.restype = c_bool + cls._lib.initJieba.argtypes = [c_char_p] # bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen) cls._lib.calculateWordOffsets.restype = c_bool @@ -168,7 +169,13 @@ def _ensureLibLoaded(cls): # TODO: make cppjieba alternative cls._lib.freeOffsets.restype = None cls._lib.freeOffsets.argtypes = [POINTER(c_int)] - cls._lib.initJieba() + # Initialize with dictionary path + import globalVars + + DICTS_DIR = os.path.join(globalVars.appDir, "cppjieba", "dicts") + DICTS_DIR_BYTES = DICTS_DIR.encode("utf-8") + dictDir = create_string_buffer(DICTS_DIR_BYTES) + cls._lib.initJieba(dictDir) except Exception as e: log.debugWarning("Failed to load cppjieba library: %s", e) cls._lib = None From 9537999a0e710075d5035b3cfcfbade90c253bfc Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 21 Sep 2025 22:52:10 +0800 Subject: [PATCH 60/93] revert copyright header of `configSpec.py` 
--- source/config/configSpec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/config/configSpec.py b/source/config/configSpec.py index 75b2b4bfabe..2c168149ec8 100644 --- a/source/config/configSpec.py +++ b/source/config/configSpec.py @@ -3,8 +3,8 @@ # Joseph Lee, Dawid Pieper, mltony, Bram Duvigneau, Cyrille Bougot, Rob Meredith, # Burman's Computer and Education Ltd., Leonard de Ruijter, Łukasz Golonka, Cary-rowen, # Wang Chong -# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. -# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt +# This file is covered by the GNU General Public License. +# See the file COPYING for more details. from io import StringIO from configobj import ConfigObj From dc23346c23952b8dfd0b8f575143dc83dc81c7de Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 21 Sep 2025 22:58:31 +0800 Subject: [PATCH 61/93] Update source/core.py Co-authored-by: Cyrille Bougot --- source/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/core.py b/source/core.py index 99225b83c27..f6758017471 100644 --- a/source/core.py +++ b/source/core.py @@ -914,7 +914,7 @@ def main(): wordSeg.initialize() except RuntimeError: log.warning("Word segmentation module disabled in configuration") - except: # noqa: E722 + except Exception: log.error("Error initializing word segmentation module", exc_info=True) if globalVars.appArgs.install or globalVars.appArgs.installSilent: From ccf07f9f6709cde57bba3a86d2731aa9bf0e0bad Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Thu, 25 Sep 2025 22:31:47 +0800 Subject: [PATCH 62/93] correct method naming --- source/textUtils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 
43aa36a5973..d5f07e9d872 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -559,9 +559,9 @@ def __init__(self, text: str, encoding: str = "UTF-8", wordSegFlag: WordSegFlag self.text: str = text self.encoding: str | None = encoding self.wordSegFlag: WordSegFlag = wordSegFlag - self.strategy: wordSegStrategy.WordSegmentationStrategy = self._choose_strategy() + self.strategy: wordSegStrategy.WordSegmentationStrategy = self._chooseStrategy() - def _choose_strategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: optimize + def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: optimize """Choose the appropriate segmentation strategy based on the text content.""" if self.wordSegFlag == WordSegFlag.AUTO: if WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search( From 250e7007eb94fd5bbaff68f7260c05b7ecbbea31 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Thu, 25 Sep 2025 22:32:03 +0800 Subject: [PATCH 63/93] update UI text for Uniscribe --- source/config/featureFlagEnums.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/config/featureFlagEnums.py b/source/config/featureFlagEnums.py index 5b6b1914a21..59c78bef409 100644 --- a/source/config/featureFlagEnums.py +++ b/source/config/featureFlagEnums.py @@ -148,7 +148,7 @@ def _displayStringLabels(self): # Translators: Label for a method of word segmentation. self.AUTO: _("Auto"), # Translators: Label for a method of word segmentation. - self.UNISCRIBE: _("Uniscribe"), + self.UNISCRIBE: _("Standard"), # Translators: Label for a method of word segmentation. 
self.CHINESE: _("Chinese"), } From 53b38706d1be6f9f19dbcfa9d66927a6eddd087e Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Thu, 25 Sep 2025 23:09:55 +0800 Subject: [PATCH 64/93] make `cppjieba` only available when NVDA's language is set to Chinese --- source/textUtils/__init__.py | 12 +++++++++--- source/textUtils/wordSeg/wordSegStrategy.py | 6 ++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index d5f07e9d872..059165533f0 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -567,7 +567,10 @@ def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: if WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search( self.text, ) and not WordSegmenter._KANA.search(self.text): - return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) + if wordSegStrategy.ChineseWordSegmentationStrategy._lib: + return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) + else: + return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) else: return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) else: @@ -575,9 +578,12 @@ def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: case WordSegFlag.UNISCRIBE: return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) case WordSegFlag.CHINESE: - return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) + if wordSegStrategy.ChineseWordSegmentationStrategy._lib: + return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) + else: + return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) case _: - pass + return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: """Get the segment containing the 
given offset.""" diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index f52dce8e7bf..f8969c72e61 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -131,12 +131,14 @@ class ChineseWordSegmentationStrategy(WordSegmentationStrategy): @classmethod @initializerRegistry - def _ensureLibLoaded(cls): # TODO: make cppjieba alternative + def _initCppJieba(cls): # TODO: make cppjieba alternative """ Class-level initializer: attempts to load the versioned cppjieba library and set up ctypes signatures. """ - if cls._lib is not None: + import config + + if not config.conf["general"]["language"].startswith("zh") or cls._lib is not None: return try: from NVDAState import ReadPaths From 111a24d1152aec36c43bbb97567f2f62a9160d1d Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Fri, 26 Sep 2025 20:50:44 +0800 Subject: [PATCH 65/93] update `wordSegSegmenter.py` to handle offsets at the end of the string --- source/textUtils/wordSeg/wordSegStrategy.py | 24 +++++++++++---------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index f8969c72e61..4141d53c81a 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -69,6 +69,17 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> """Segmented result with separators.""" pass + def getWordOffsetRange( + self, + offset: int, + ) -> tuple[int, int] | None: + """Helper to get word offset range from a list of word end offsets.""" + if not self.wordEnds: + return None + index = next((i for i, end in enumerate(self.wordEnds) if end > offset), len(self.wordEnds) - 1) + start = 0 if index == 0 else self.wordEnds[index - 1] + end = self.wordEnds[index] + return (start, end) class 
UniscribeWordSegmentationStrategy(WordSegmentationStrategy): """Windows Uniscribe-based segmentation (calls NVDAHelper.localLib.calculateWordOffsets).""" @@ -357,17 +368,8 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> return result def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: - wordEnds = self._callCPPJieba() - if wordEnds is None or not wordEnds: - return - index = next((i for i, end in enumerate(wordEnds) if end > offset)) - if index == 0: - start = 0 - else: - start = wordEnds[index - 1] - end = wordEnds[index] if index < len(wordEnds) else len(self.text) - return (start, end) + return self.getWordOffsetRange(offset) def __init__(self, text, encoding=None): super().__init__(text, encoding) - self.wordEndIndex = self._callCPPJieba() + self.wordEnds = self._callCPPJieba() From 43bfe0362fd65f70b6824f488cedf3be00453408 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sat, 27 Sep 2025 08:39:14 +0800 Subject: [PATCH 66/93] make initialization of word segmenters conditional on language --- source/config/configSpec.py | 1 + source/config/featureFlagEnums.py | 16 ++++++++++++++++ source/gui/settingsDialogs.py | 6 ++++++ source/textUtils/wordSeg/wordSegStrategy.py | 15 ++++++++++++++- 4 files changed, 37 insertions(+), 1 deletion(-) diff --git a/source/config/configSpec.py b/source/config/configSpec.py index 2c168149ec8..b6eaa2d24d2 100644 --- a/source/config/configSpec.py +++ b/source/config/configSpec.py @@ -262,6 +262,7 @@ reportClickable = boolean(default=true) [documentNavigation] + initWordSegForUnusedLang = boolean(default=false) wordSegmentationStandard = featureFlag(optionsEnum="WordNavigationUnitFlag", behaviorOfDefault="Auto") paragraphStyle = featureFlag(optionsEnum="ParagraphNavigationFlag", behaviorOfDefault="application") diff --git a/source/config/featureFlagEnums.py b/source/config/featureFlagEnums.py index 59c78bef409..45ed5a5622e 100644 --- 
a/source/config/featureFlagEnums.py +++ b/source/config/featureFlagEnums.py @@ -138,6 +138,22 @@ def _displayStringLabels(self) -> dict["FontFormattingBrailleModeFlag", str]: FontFormattingBrailleModeFlag.TAGS: _("Tags"), } +class InitWordSegForUnusedLnagFlag(DisplayStringEnum): + """Boolean flag for whether to initialize the word segmenters for all languages, even if they are not used.""" + + @property + def _displayStringLabels(self): + return { + # Translators: Label for an option in NVDA settings. + self.DISABLED: _("Disabled"), + # Translators: Label for an option in NVDA settings. + self.ENABLED: _("Enabled"), + } + + DEFAULT = enum.auto() + DISABLED = enum.auto() + ENABLED = enum.auto() + class WordNavigationUnitFlag(DisplayStringEnum): """Enumeration for word navigation.""" diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py index d4e83f06e78..ed0f91404a5 100644 --- a/source/gui/settingsDialogs.py +++ b/source/gui/settingsDialogs.py @@ -3092,6 +3092,11 @@ class DocumentNavigationPanel(SettingsPanel): def makeSettings(self, settingsSizer: wx.BoxSizer) -> None: sHelper = guiHelper.BoxSizerHelper(self, sizer=settingsSizer) + # Translators: This is a label for the initialization for word segmenters for unused languages in the document navigation dialog + initUnusedLangLabel = _("&Initialize word segmenters for unused languages:") + self.initUnusedLangCheckBox: wx.CheckBox = sHelper.addItem(wx.CheckBox(self, label=initUnusedLangLabel)) + self.bindHelpEvent("initWordSegForUnusedLang", self.initUnusedLangCheckBox) + # Translators: This is a label for the word segmentation standard in the document navigation dialog WordNavigationUnitLabel = _("&Word Segmentation Standard:") self.wordSegCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl( @@ -3113,6 +3118,7 @@ def makeSettings(self, settingsSizer: wx.BoxSizer) -> None: self.bindHelpEvent("ParagraphStyle", self.paragraphStyleCombo) def onSave(self): + 
config.conf["documentNavigation"]["initWordSegForUnusedLang"] = self.initUnusedLangCheckBox.IsChecked() self.wordSegCombo.saveCurrentValueToConf() self.paragraphStyleCombo.saveCurrentValueToConf() diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index 4141d53c81a..b89d74d162d 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -17,6 +17,7 @@ from functools import lru_cache from collections.abc import Callable from typing import Any +import re import textUtils from logHandler import log @@ -81,6 +82,13 @@ def getWordOffsetRange( end = self.wordEnds[index] return (start, end) + @classmethod + def isUsingRelatedLanguage(cls) -> bool: + """Returns True if this strategy is for the current language.""" + return re.match(cls._LANGUAGE_PATTERN, languageHandler.getWindowsLanguage()) \ + or re.match(cls._LANGUAGE_PATTERN, languageHandler.getLanguage()) \ + or re.match(cls._LANGUAGE_PATTERN, braille.handler.table.fileName) + class UniscribeWordSegmentationStrategy(WordSegmentationStrategy): """Windows Uniscribe-based segmentation (calls NVDAHelper.localLib.calculateWordOffsets).""" @@ -139,6 +147,7 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> class ChineseWordSegmentationStrategy(WordSegmentationStrategy): _lib = None + _LANGUAGE_PATTERN = re.compile(r"^zh", re.IGNORECASE) @classmethod @initializerRegistry @@ -148,8 +157,12 @@ def _initCppJieba(cls): # TODO: make cppjieba alternative set up ctypes signatures. 
""" import config + import braille + import languageHandler - if not config.conf["general"]["language"].startswith("zh") or cls._lib is not None: + if cls._lib is not None \ + or not (config.conf["documentNavigation"]["initWordSegForUnusedLang"] \ + or cls.isUsingRelatedLanguage()): return try: from NVDAState import ReadPaths From 2eec029e6e9e417ebd18a2b3bf5841dc81059ab1 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sat, 27 Sep 2025 08:40:57 +0800 Subject: [PATCH 67/93] add unittest cases for `WordSegmenter` --- tests/unit/test_textUtils.py | 46 ++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_textUtils.py b/tests/unit/test_textUtils.py index 6993ac7d962..0e6501733ab 100644 --- a/tests/unit/test_textUtils.py +++ b/tests/unit/test_textUtils.py @@ -1,13 +1,13 @@ # A part of NonVisual Desktop Access (NVDA) # This file is covered by the GNU General Public License. # See the file COPYING for more details. -# Copyright (C) 2019-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter +# Copyright (C) 2019-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong """Unit tests for the textUtils module.""" import unittest -from textUtils import UnicodeNormalizationOffsetConverter, WideStringOffsetConverter +from textUtils import UnicodeNormalizationOffsetConverter, WideStringOffsetConverter, WordSegmenter from textUtils.uniscribe import splitAtCharacterBoundaries FACE_PALM = "\U0001f926" # 🤦 @@ -442,3 +442,45 @@ def test_sentenceWithComposites(self): def test_hebrew(self): self._testHelper("בְּרֵאשִׁית", ["בְּ", "רֵ", "א", "שִׁ", "י", "ת"]) + + +class TestWordSegmenter(unittest.TestCase): + """Tests for the WordSegmenter class.""" + + def test_basicLatin(self): + text = "hello world" + segmenter = WordSegmenter(text) + self.assertEqual(segmenter.getSegmentForOffset(0), (0, 6)) + self.assertEqual(segmenter.getSegmentForOffset(1), (0, 6)) + 
self.assertEqual(segmenter.getSegmentForOffset(2), (0, 6)) + self.assertEqual(segmenter.getSegmentForOffset(3), (0, 6)) + self.assertEqual(segmenter.getSegmentForOffset(4), (0, 6)) + self.assertEqual(segmenter.getSegmentForOffset(5), (0, 6)) + self.assertEqual(segmenter.getSegmentForOffset(6), (6, 11)) + self.assertEqual(segmenter.getSegmentForOffset(7), (6, 11)) + self.assertEqual(segmenter.getSegmentForOffset(8), (6, 11)) + self.assertEqual(segmenter.getSegmentForOffset(9), (6, 11)) + self.assertEqual(segmenter.getSegmentForOffset(10), (6, 11)) + self.assertEqual(segmenter.getSegmentForOffset(11), (6, 11)) + + def test_chinese(self): + text = "你好世界" + + # ensure that the Chinese segmentation strategy is used + import config + from textUtils.wordSeg.wordSegStrategy import ChineseWordSegmentationStrategy + + temp = config.conf["documentNavigation"]["initWordSegForUnusedLang"] + # ensure the word segmenters for unused languages are initialized + config.conf["documentNavigation"]["initWordSegForUnusedLang"] = True + + ChineseWordSegmentationStrategy._initCppJieba() + segmenter = WordSegmenter(text) + self.assertEqual(segmenter.getSegmentForOffset(0), (0, 2)) + self.assertEqual(segmenter.getSegmentForOffset(1), (0, 2)) + self.assertEqual(segmenter.getSegmentForOffset(2), (2, 4)) + self.assertEqual(segmenter.getSegmentForOffset(3), (2, 4)) + self.assertEqual(segmenter.getSegmentForOffset(4), (2, 4)) + + # revert the config change + config.conf["documentNavigation"]["initWordSegForUnusedLang"] = temp From f769457298dc546d5ee669939a154576829807f8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 27 Sep 2025 00:46:03 +0000 Subject: [PATCH 68/93] Pre-commit auto-fix --- source/config/featureFlagEnums.py | 1 + source/gui/settingsDialogs.py | 8 ++++++-- source/textUtils/wordSeg/wordSegStrategy.py | 21 +++++++++++---------- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git 
a/source/config/featureFlagEnums.py b/source/config/featureFlagEnums.py index 45ed5a5622e..357a16d2db7 100644 --- a/source/config/featureFlagEnums.py +++ b/source/config/featureFlagEnums.py @@ -138,6 +138,7 @@ def _displayStringLabels(self) -> dict["FontFormattingBrailleModeFlag", str]: FontFormattingBrailleModeFlag.TAGS: _("Tags"), } + class InitWordSegForUnusedLnagFlag(DisplayStringEnum): """Boolean flag for whether to initialize the word segmenters for all languages, even if they are not used.""" diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py index ed0f91404a5..b26e72ed060 100644 --- a/source/gui/settingsDialogs.py +++ b/source/gui/settingsDialogs.py @@ -3094,7 +3094,9 @@ def makeSettings(self, settingsSizer: wx.BoxSizer) -> None: # Translators: This is a label for the initialization for word segmenters for unused languages in the document navigation dialog initUnusedLangLabel = _("&Initialize word segmenters for unused languages:") - self.initUnusedLangCheckBox: wx.CheckBox = sHelper.addItem(wx.CheckBox(self, label=initUnusedLangLabel)) + self.initUnusedLangCheckBox: wx.CheckBox = sHelper.addItem( + wx.CheckBox(self, label=initUnusedLangLabel) + ) self.bindHelpEvent("initWordSegForUnusedLang", self.initUnusedLangCheckBox) # Translators: This is a label for the word segmentation standard in the document navigation dialog @@ -3118,7 +3120,9 @@ def makeSettings(self, settingsSizer: wx.BoxSizer) -> None: self.bindHelpEvent("ParagraphStyle", self.paragraphStyleCombo) def onSave(self): - config.conf["documentNavigation"]["initWordSegForUnusedLang"] = self.initUnusedLangCheckBox.IsChecked() + config.conf["documentNavigation"]["initWordSegForUnusedLang"] = ( + self.initUnusedLangCheckBox.IsChecked() + ) self.wordSegCombo.saveCurrentValueToConf() self.paragraphStyleCombo.saveCurrentValueToConf() diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index b89d74d162d..610459b2483 100644 --- 
a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -71,9 +71,9 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> pass def getWordOffsetRange( - self, - offset: int, - ) -> tuple[int, int] | None: + self, + offset: int, + ) -> tuple[int, int] | None: """Helper to get word offset range from a list of word end offsets.""" if not self.wordEnds: return None @@ -85,9 +85,12 @@ def getWordOffsetRange( @classmethod def isUsingRelatedLanguage(cls) -> bool: """Returns True if this strategy is for the current language.""" - return re.match(cls._LANGUAGE_PATTERN, languageHandler.getWindowsLanguage()) \ - or re.match(cls._LANGUAGE_PATTERN, languageHandler.getLanguage()) \ + return ( + re.match(cls._LANGUAGE_PATTERN, languageHandler.getWindowsLanguage()) + or re.match(cls._LANGUAGE_PATTERN, languageHandler.getLanguage()) or re.match(cls._LANGUAGE_PATTERN, braille.handler.table.fileName) + ) + class UniscribeWordSegmentationStrategy(WordSegmentationStrategy): """Windows Uniscribe-based segmentation (calls NVDAHelper.localLib.calculateWordOffsets).""" @@ -157,12 +160,10 @@ def _initCppJieba(cls): # TODO: make cppjieba alternative set up ctypes signatures. 
""" import config - import braille - import languageHandler - if cls._lib is not None \ - or not (config.conf["documentNavigation"]["initWordSegForUnusedLang"] \ - or cls.isUsingRelatedLanguage()): + if cls._lib is not None or not ( + config.conf["documentNavigation"]["initWordSegForUnusedLang"] or cls.isUsingRelatedLanguage() + ): return try: from NVDAState import ReadPaths From 9479029f639469a394540d3f5f82d2330c3303a5 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sat, 27 Sep 2025 09:15:51 +0800 Subject: [PATCH 69/93] fixup --- source/textUtils/wordSeg/wordSegStrategy.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index 610459b2483..b74cdd2ccf3 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -85,6 +85,9 @@ def getWordOffsetRange( @classmethod def isUsingRelatedLanguage(cls) -> bool: """Returns True if this strategy is for the current language.""" + import languageHandler + import braille + return ( re.match(cls._LANGUAGE_PATTERN, languageHandler.getWindowsLanguage()) or re.match(cls._LANGUAGE_PATTERN, languageHandler.getLanguage()) From 9834b686e496ea3a16708d2b075f5e3e453fba71 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sat, 27 Sep 2025 09:16:37 +0800 Subject: [PATCH 70/93] extract punctuation from `wordSegStrategy.py` to `wordSegUtils.py` --- source/textUtils/wordSeg/wordSegStrategy.py | 75 +------------------- source/textUtils/wordSeg/wordSegUtils.py | 77 +++++++++++++++++++++ 2 files changed, 79 insertions(+), 73 deletions(-) create mode 100644 source/textUtils/wordSeg/wordSegUtils.py diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index b74cdd2ccf3..467a0981964 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -273,81 +273,10 @@ def 
_callCPPJieba(self) -> list[int] | None: pass return None - # Punctuation that should NOT have a separator BEFORE it (no space before these marks) - NO_SEP_BEFORE = { - # Common Chinese fullwidth punctuation - "。", - ",", - "、", - ";", - ":", - "?", - "!", - "…", - "...", - "—", - "–", - "——", - ")", - "】", - "》", - "〉", - "」", - "』", - "”", - "’", - "%", - "‰", - "¥", - # Common ASCII / halfwidth punctuation - ".", - ",", - ";", - ":", - "?", - "!", - "%", - ".", - ")", - "]", - "}", - ">", - '"', - "'", - } - - # Punctuation that should NOT have a separator AFTER it (no space after these marks) - NO_SEP_AFTER = { - # Common Chinese fullwidth opening/leading punctuation - "(", - "【", - "《", - "〈", - "「", - "『", - "“", - "‘", - # Common ASCII / halfwidth opening/leading punctuation - "(", - "[", - "{", - "<", - '"', - "'", - # Currency and prefix-like symbols that typically bind to the following token - "$", - "€", - "£", - "¥", - "₹", - # Social/identifier prefixes - "@", - "#", - "&", - } - def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: """Segments the text using the word end indices.""" + from .wordSegUtils import NO_SEP_BEFORE, NO_SEP_AFTER + if len(self.wordEndIndex) <= 1: return self.text diff --git a/source/textUtils/wordSeg/wordSegUtils.py b/source/textUtils/wordSeg/wordSegUtils.py new file mode 100644 index 00000000000..8298bda1359 --- /dev/null +++ b/source/textUtils/wordSeg/wordSegUtils.py @@ -0,0 +1,77 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Wang Chong +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +# Punctuation that should NOT have a separator BEFORE it (no space before these marks) +NO_SEP_BEFORE = { + # Common Chinese fullwidth punctuation + "。", + ",", + "、", + ";", + ":", + "?", + "!", + "…", + "...", + "—", + "–", + "——", + ")", + "】", + "》", + "〉", + "」", + "』", + "”", + "’", + "%", + "‰", + "¥", + # Common ASCII / halfwidth punctuation + ".", + ",", + ";", + ":", + "?", + "!", + "%", + ".", + ")", + "]", + "}", + ">", + '"', + "'", +} + +# Punctuation that should NOT have a separator AFTER it (no space after these marks) +NO_SEP_AFTER = { + # Common Chinese fullwidth opening/leading punctuation + "(", + "【", + "《", + "〈", + "「", + "『", + "“", + "‘", + # Common ASCII / halfwidth opening/leading punctuation + "(", + "[", + "{", + "<", + '"', + "'", + # Currency and prefix-like symbols that typically bind to the following token + "$", + "€", + "£", + "¥", + "₹", + # Social/identifier prefixes + "@", + "#", + "&", +} From b69d466fd406d6c477aba55048b1a055dfb2198e Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 28 Sep 2025 07:55:26 +0800 Subject: [PATCH 71/93] fix up --- source/gui/settingsDialogs.py | 4 ++-- source/textUtils/wordSeg/wordSegStrategy.py | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py index b26e72ed060..307251705d0 100644 --- a/source/gui/settingsDialogs.py +++ b/source/gui/settingsDialogs.py @@ -3093,9 +3093,9 @@ def makeSettings(self, settingsSizer: wx.BoxSizer) -> None: sHelper = guiHelper.BoxSizerHelper(self, sizer=settingsSizer) # Translators: This is a label for the initialization for word segmenters for unused languages in the document navigation dialog - initUnusedLangLabel = _("&Initialize word segmenters for unused languages:") + initUnusedLangLabel = _("&Initialize Word 
Segmenters for Unused Languages:") self.initUnusedLangCheckBox: wx.CheckBox = sHelper.addItem( - wx.CheckBox(self, label=initUnusedLangLabel) + wx.CheckBox(self, label=initUnusedLangLabel), ) self.bindHelpEvent("initWordSegForUnusedLang", self.initUnusedLangCheckBox) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index 467a0981964..0596a5bf159 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -275,16 +275,17 @@ def _callCPPJieba(self) -> list[int] | None: def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: """Segments the text using the word end indices.""" - from .wordSegUtils import NO_SEP_BEFORE, NO_SEP_AFTER - if len(self.wordEndIndex) <= 1: + if len(self.wordEnds) <= 1: return self.text + from .wordSegUtils import NO_SEP_BEFORE, NO_SEP_AFTER + result = "" - for sepIndex in range(len(self.wordEndIndex) - 1): - preIndex = 0 if sepIndex == 0 else self.wordEndIndex[sepIndex - 1] - curIndex = self.wordEndIndex[sepIndex] - postIndex = self.wordEndIndex[sepIndex + 1] + for sepIndex in range(len(self.wordEnds) - 1): + preIndex = 0 if sepIndex == 0 else self.wordEnds[sepIndex - 1] + curIndex = self.wordEnds[sepIndex] + postIndex = self.wordEnds[sepIndex + 1] # append the token before the potential separator position result += self.text[preIndex:curIndex] @@ -298,9 +299,9 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> nextSlice = self.text[curIndex:postIndex] # Determine whether any punctuation forbids a separator BEFORE the next token - noSepBefore = any(nextSlice.startswith(s) for s in self.NO_SEP_BEFORE) + noSepBefore = any(nextSlice.startswith(s) for s in NO_SEP_BEFORE) # Determine whether any punctuation forbids a separator AFTER the current result - noSepAfter = any(result.endswith(s) for s in self.NO_SEP_AFTER) + noSepAfter = any(result.endswith(s) for s in NO_SEP_AFTER) if not 
(noSepBefore or noSepAfter): # If neither side forbids the separator, add it From 6f586fd0898a1bb7868d729876407b4485d067e5 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 28 Sep 2025 07:58:07 +0800 Subject: [PATCH 72/93] update changelog --- user_docs/en/changes.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index e066e775dc5..0f74c454401 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -20,6 +20,8 @@ This can be enabled using the "Report when lists support multiple selection" set * VirusTotal scan results are now available in the details for an add-on in the Add-on Store. An action has been added to view the full scan results on the VirusTotal website. (#18974) * In the Add-on Store, a new action has been added to see the latest changes for the current version of add-ons. (#14041, @josephsl, @nvdaes) +* Chinese text can be navigated by word via build-in input gestures. + Several GUI elements are added for its configuration in `Document Navigation` panel. (#18735, @CrazySteve0605) ### Changes @@ -164,6 +166,8 @@ Use `INPUT_TYPE.MOUSE`, `INPUT_TYPE.KEYBOARD`, `KEYEVENTF.KEYUP` and `KEYEVENTF. Use `winBindings.magnification.MAGCOLOREFFECT` instead. (#18958) * `visionEnhancementProviders.screenCurtain.isScreenFullyBlack` is deprecated. Use `NVDAHelper.localLib.isScreenFullyBlack` instead. (#18958) +* `useUniscribe` from `textUtils.offset.OffsetsTextInfo` and its subclasses is deprecated. + Use `charSegFlag` and `wordSegFlag` instead. 
(#18735) From 3b7bf5fa335301c003e42ca0a3dae23fd04ad65f Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 28 Sep 2025 08:02:59 +0800 Subject: [PATCH 73/93] correct and simplify the offset calculations --- source/textUtils/wordSeg/wordSegUtils.py | 97 ++++++++++++++++++------ 1 file changed, 73 insertions(+), 24 deletions(-) diff --git a/source/textUtils/wordSeg/wordSegUtils.py b/source/textUtils/wordSeg/wordSegUtils.py index 811a4175ef3..e011d99b555 100644 --- a/source/textUtils/wordSeg/wordSegUtils.py +++ b/source/textUtils/wordSeg/wordSegUtils.py @@ -18,34 +18,83 @@ def __init__(self, text: str): super().__init__(text) self.newSepIndex: list[int] = [] self.encoded = WordSegmenter(text).segmentedText(sep=self.sep, newSepIndex=self.newSepIndex) - self.computedStrToEncodedOffsets = list(range(self.strLength)) - for i in range(len(self.computedStrToEncodedOffsets)): - self.computedStrToEncodedOffsets[i] += self._relevantStrToEncodedOffsets[i] - self.computedEncodedToStrOffsets = list(range(self.encodedStringLength)) - for j in range(len(self.computedEncodedToStrOffsets)): - self.computedEncodedToStrOffsets[j] += self._relevantEncodedToStrOffsets[j] @property - def _relevantStrToEncodedOffsets(self) -> list[int]: - relevantIndex: list[int] = [0 for _ in range(self.strLength)] - j = 0 - m = len(self.newSepIndex) - for i in range(self.strLength): - while j < m and self.newSepIndex[j] <= i + j: - j += 1 - relevantIndex[i] = j - return relevantIndex + def computedStrToEncodedOffsets(self) -> list[int]: + """ + Compute a list of offsets so that: + encodedIndex = strIndex + relevantStrToEncodedOffsets[strIndex] + + We build an explicit mapping from original string indices to encoded indices + by marking separator positions in the encoded string and then assigning + each non-separator encoded slot to the next original-character index. + The returned list contains the delta (encodedIndex - strIndex) for each + original index. 
+ """ + strLen = self.strLength + encodedLen = self.encodedStringLength + sepCount = len(self.newSepIndex) + + # validate separator positions (optional but makes bugs obvious) + for pos in self.newSepIndex: + if pos < 0 or pos >= encodedLen: + raise ValueError(f"separator position {pos} out of range for encoded length {encodedLen}") + + # mark which encoded positions are separators + isSep = [False] * encodedLen + for pos in self.newSepIndex: + isSep[pos] = True + + # build explicit str -> encoded mapping + strToEncoded: list[int] = [0] * strLen + nextStrIndex = 0 + for encodedIndex in range(encodedLen): + if not isSep[encodedIndex]: + # assign the current original-char index to this encoded slot + # then advance to the next original index + if nextStrIndex >= strLen: + # defensive: there should not be more non-sep encoded slots than strLen + # but handle gracefully + break + strToEncoded[nextStrIndex] = encodedIndex + nextStrIndex += 1 + + return strToEncoded + @property - def _relevantEncodedToStrOffsets(self) -> list[int]: - relevantIndex: list[int] = [0 for _ in range(self.encodedStringLength)] - j = 0 - m = len(self.newSepIndex) - for i in range(self.encodedStringLength): - while j < m and self.newSepIndex[j] < i + j: - j += 1 - relevantIndex[i] = -j - return relevantIndex + def computedEncodedToStrOffsets(self) -> list[int]: + encodedLen = self.encodedStringLength + strLen = self.strLength + sepCount = len(self.newSepIndex) + + # validate separator positions + for pos in self.newSepIndex: + if pos < 0 or pos >= encodedLen: + raise ValueError(f"separator position {pos} out of range for encoded length {encodedLen}") + + # mark which encoded positions are separators + isSep = [False] * encodedLen + for pos in self.newSepIndex: + isSep[pos] = True + + # build explicit encoded -> str mapping + # semantics: separator positions and the following encoded character + # both map to the same upcoming original str index (insertion point semantics). 
+ encodedToStr: list[int] = [0] * encodedLen + nextStrIndex = 0 + for encodedIndex in range(encodedLen): + if isSep[encodedIndex]: + # map separator to the next original character index (insertion point) + encodedToStr[encodedIndex] = nextStrIndex + else: + # map this encoded character to the current original index, + # then advance the original index for subsequent positions + encodedToStr[encodedIndex] = nextStrIndex + nextStrIndex += 1 + + return encodedToStr + @cached_property def encodedStringLength(self) -> int: From 251811e4a5ab30f629e9d18f1be95da203d7f548 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sun, 28 Sep 2025 08:04:06 +0800 Subject: [PATCH 74/93] update changelog --- user_docs/en/changes.md | 468 ++++++++++++++++++++-------------------- 1 file changed, 236 insertions(+), 232 deletions(-) diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index 0f74c454401..703bc507dc8 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -6,28 +6,29 @@ * This release breaks compatibility with existing add-ons. * Windows 8.1 is no longer supported. -Windows 10 is the minimum Windows version supported. -We recommend updating to Windows 11, or when that's not possible, to the latest Windows 10 version (22H2). + Windows 10 is the minimum Windows version supported. + We recommend updating to Windows 11, or when that's not possible, to the latest Windows 10 version (22H2). * 32-bit Windows is no longer supported. ### New Features * Added the possibility to report when multiple items can be selected in a list control. -This can be enabled using the "Report when lists support multiple selection" setting in NVDA's object presentation settings. (#18365 @LeonarddeR) + This can be enabled using the "Report when lists support multiple selection" setting in NVDA's object presentation settings. 
(#18365 @LeonarddeR) * In Visual Studio Code, the status bar is now reported when using the standard `NVDA+end` (desktop) / `NVDA+shift+end` (laptop) gesture. (#11064, @codeofdusk) * Performance improvements on ARM64 systems, such as with Qualcomm processors. (#18570, @leonarddeR) * While reading text, spelling errors can now be reported with a sound instead of speech. (#4233, @jcsteh, @CyrilleB79) * VirusTotal scan results are now available in the details for an add-on in the Add-on Store. -An action has been added to view the full scan results on the VirusTotal website. (#18974) + An action has been added to view the full scan results on the VirusTotal website. (#18974) * In the Add-on Store, a new action has been added to see the latest changes for the current version of add-ons. (#14041, @josephsl, @nvdaes) * Chinese text can be navigated by word via build-in input gestures. Several GUI elements are added for its configuration in `Document Navigation` panel. (#18735, @CrazySteve0605) +* Braille output for Chinese contains spaces as word separaters. (#18865, @CrazySteve0605) ### Changes * NVDA no longer supports Windows 8.1. -Windows 10 (Version 1507) is the minimum Windows version supported. -We recommend using Windows 11, or if that is not possible, the latest Windows 10 release (Version 22H2). (#18684, @josephsl) + Windows 10 (Version 1507) is the minimum Windows version supported. + We recommend using Windows 11, or if that is not possible, the latest Windows 10 release (Version 22H2). (#18684, @josephsl) * Added a button to the About dialog to copy the NVDA version number to the clipboard. (#18667) * When entering a secure desktop, an installed copy of NVDA will automatically disable Braille temporarily, so that the secure desktop copy can access the braille display. (#2315, @LeonarddeR) * The length of beeps used when "Line indentation reporting" is set to "Tones" or "Both Speech and Tones" has been reduced. 
(#18898) @@ -51,13 +52,13 @@ We recommend using Windows 11, or if that is not possible, the latest Windows 10 Please refer to [the developer guide](https://download.nvaccess.org/documentation/developerGuide.html#API) for information on NVDA's API deprecation and removal process. * Note: this is an Add-on API compatibility breaking release. -Add-ons will need to be re-tested and have their manifest updated. + Add-ons will need to be re-tested and have their manifest updated. * Add-on authors are now able to provide a changelog for an add-on version via the `changelog` manifest key. (#14041, @josephsl) * The changelog should document changes between previous and latest add-on versions, and can be formatted in markdown. * Updated components * Licensecheck has been updated to 2025.1 (#18728, @bramd) * X64 NVDAHelper libraries are now also build for the [ARM64EC architecture](https://learn.microsoft.com/en-us/windows/arm/arm64ec). -On ARM64 machines with Windows 11, these ARM64EC libraries are loaded instead of their X64 equivalents. (#18570, @leonarddeR) + On ARM64 machines with Windows 11, these ARM64EC libraries are loaded instead of their X64 equivalents. (#18570, @leonarddeR) * Added [cppjieba](https://github.com/yanyiwu/cppjieba) as a git submodule for Chinese word segmentation. (#18548, @CrazySteve0605) #### API Breaking Changes @@ -67,17 +68,17 @@ Please open a GitHub issue if your add-on has an issue with updating to the new * NVDA is now built with Python 3.13. (#18591) * typing_extensions have been removed. -These should be supported natively in Python 3.13. (#18689) + These should be supported natively in Python 3.13. (#18689) * `copyrightYears` and `url` have been moved from `versionInfo` to `buildVersion`. (#18682) * Fixed behavior of `TextInfo.collapse()` - previously it was moving TextInfo to the next paragraph in some cases. 
(#18320, @mltony) * Fixed behavior of `OffsetTextInfo.move()` - previously it wouldn't move to the very end of the document unless moving by character. (#18348, @mltony) * `NVDAHelper.localLib` is now a module, not a `ctypes.CDLL`. -Most API consumers should not be impacted by this change. -Use `NVDAHelper.localLib.dll` for access to the `ctypes.CDLL` if necessary. (#18207) + Most API consumers should not be impacted by this change. + Use `NVDAHelper.localLib.dll` for access to the `ctypes.CDLL` if necessary. (#18207) * `UIAHandler.autoSelectDetectionAvailable` is removed with no replacement. (#18684, @josephsl) * The following symbols have been removed from `installer` with no direct replacement: `programFilesPath`, `getStartMenuFolder`, `getInstallPath`. (#18851) * The `bool` configuration key `[documentFormatting][reportSpellingErrors]` has been removed. -Use the `int` configuration key `[reportSpellingErrors2]` instead. (#17997, @CyrilleB79) + Use the `int` configuration key `[reportSpellingErrors2]` instead. (#17997, @CyrilleB79) * `NVDAObjects.window.GhostWindowFromHungWindow` has been removed with no replacement. (#18883) * `winUser.Input_I` and `winUser.PUL` have been removed, with no replacement. (#18883) * The `inputButtonCaps` property on `hwIo.hid.Hid` objects now correctly returns an array of `hidpi.HIDP_BUTTON_CAPS` structures rather than HIDP_VALUE_CAPS` structures. (#18902) @@ -86,27 +87,27 @@ Use the `int` configuration key `[reportSpellingErrors2]` instead. (#17997, @Cyr * the `rgpszUsageIdentifier` member of the `updateCheck.CERT_USAGE_MATCH` struct is now of type `POINTER(LPSTR)` rather than `c_void_p` to correctly align with Microsoft documentation. * The `UpdatableAddonsDialog.addonsList` is an instance of `gui.addonStoreGui.controls.addonList.AddonVirtualList`. (#18816, @nvdaes) * `visionEnhancementProviders.screenCurtain.Magnification` has been removed. 
-All public symbols defined on this class are now accessible from `winBindings.magnification`. (#18958) + All public symbols defined on this class are now accessible from `winBindings.magnification`. (#18958) #### Deprecations * `winVersion.WIN81` constant has been deprecated from the `winVersion` module. (#18684, @josephsl): * `NVDAHelper.versionedLibPath` is deprecated. -Use `NVDAState.ReadPaths.versionedLibX86Path` instead. (#18207) + Use `NVDAState.ReadPaths.versionedLibX86Path` instead. (#18207) * `NVDAHelper.coreArchLibPath` is deprecated. -Use `NVDAState.ReadPaths.coreArchLibPath` instead. (#18207) + Use `NVDAState.ReadPaths.coreArchLibPath` instead. (#18207) * `NVDAHelper.LOCAL_WIN10_DLL_PATH` is deprecated. -Use `NVDAState.ReadPaths.nvdaHelperLocalWin10Dll` instead. (#18207) + Use `NVDAState.ReadPaths.nvdaHelperLocalWin10Dll` instead. (#18207) * `NVDAHelper.generateBeep` is deprecated. -Use `NVDAHelper.localLib.generateBeep` instead. (#18207) + Use `NVDAHelper.localLib.generateBeep` instead. (#18207) * `NVDAHelper.VBuf_getTextInRange` is deprecated. -Use `NVDAHelper.localLib.VBuf_getTextInRange` instead. (#18207) + Use `NVDAHelper.localLib.VBuf_getTextInRange` instead. (#18207) * `NVDAHelper.onSsmlMarkReached` is deprecated. -Use `NVDAHelper.localLib.nvdaController_onSsmlMarkReached` instead. (#18207) + Use `NVDAHelper.localLib.nvdaController_onSsmlMarkReached` instead. (#18207) * `NVDAObjects.window.excel.ExcelCellInfo` is deprecated. -Use `NVDAHelper.localLib.EXCEL_CELLINFO` instead. (#18207) + Use `NVDAHelper.localLib.EXCEL_CELLINFO` instead. (#18207) * `nvwave.WAVEFORMATEX` is deprecated. -Use `winBindings.mmeapi.WAVEFORMATEX` instead. (#18207) + Use `winBindings.mmeapi.WAVEFORMATEX` instead. (#18207) * The following symbols have been moved from `winuser` to `winBindings.user32`: `GUITHREADINFO`, `HWINEVENTHOOK`, `WINEVENTPROC`, `WNDCLASSEXW`, `WNDPROC`, `PAINTSTRUCT`. 
(#18207, #18883) * The following symbols have been moved from `hwPortUtils` to `winBindings.bthprops`: `BLUETOOTH_ADDRESS`, `BLUETOOTH_DEVICE_INFO`, `BLUETOOTH_MAX_NAME_SIZE`, `BluetoothGetDeviceInfo`. Access to these symbols via `hwPortUtils` is deprecated. (#18571) @@ -160,16 +161,17 @@ Use `winBindings.mmeapi.WAVEFORMATEX` instead. (#18207) * The `LVS_*` constants from `NVDAObjects.IAccessible.sysListView32` are deprecated. Use the `ListViewWindowStyle` enumeration instead. (#18926 , @LeonarddeR) * The `INPUT_MOUSE`, `INPUT_KEYBOARD`, `KEYEVENTF_KEYUP` and `KEYEVENTF_UNICODE` constants from `winUser` are deprecated. -Use `INPUT_TYPE.MOUSE`, `INPUT_TYPE.KEYBOARD`, `KEYEVENTF.KEYUP` and `KEYEVENTF.UNICODE` from `winBindings.user32` instead. (#18947) + Use `INPUT_TYPE.MOUSE`, `INPUT_TYPE.KEYBOARD`, `KEYEVENTF.KEYUP` and `KEYEVENTF.UNICODE` from `winBindings.user32` instead. (#18947) * The following symbols have been moved from `updateCheck` to `winBindings.crypt32`: `CERT_USAGE_MATCH`, `CERT_CHAIN_PARA`. (#18956) * `visionEnhancementProviders.screenCurtain.MAGCOLOREFFECT` is deprecated. -Use `winBindings.magnification.MAGCOLOREFFECT` instead. (#18958) + Use `winBindings.magnification.MAGCOLOREFFECT` instead. (#18958) * `visionEnhancementProviders.screenCurtain.isScreenFullyBlack` is deprecated. -Use `NVDAHelper.localLib.isScreenFullyBlack` instead. (#18958) + Use `NVDAHelper.localLib.isScreenFullyBlack` instead. (#18958) * `useUniscribe` from `textUtils.offset.OffsetsTextInfo` and its subclasses is deprecated. Use `charSegFlag` and `wordSegFlag` instead. (#18735) + ## 2025.3.1 @@ -209,15 +211,15 @@ Localisation data for emojis has been added for Belarusian and Bosnian. * Component updates: * Updated eSpeak NG to [commit `3b8ef3d`](https://github.com/espeak-ng/espeak-ng/commit/3b8ef3d310f380e9ab4c6b19bf8367d8f99ac285). - There have been improvements to Farsi/Persian. (#18342, #18633, @codeofdusk) + There have been improvements to Farsi/Persian. 
(#18342, #18633, @codeofdusk) * Updated Unicode CLDR to [47.0](https://cldr.unicode.org/downloads/cldr-47). - Localisation data for emojis has been added for Belarusian and Bosnian. (#18581) + Localisation data for emojis has been added for Belarusian and Bosnian. (#18581) * Braille: * When braille word wrap is enabled, all braille cells will be used if the next character is a space. (#18016, @nvdaes) * NVDA no longer resets braille tables to automatic when changing its language. (#18538, @LeonarddeR) * NVDA no longer handles Turkish grade 1 as Turkish 8 dot computer braille. (#18758, @OzancanKaratas) * The Dot Pad braille display driver now supports automatic detection of USB-connected devices. - Note that this is disabled by default due to the device using generic USB identifiers, but can be enabled in braille settings. (#18444, @bramd) + Note that this is disabled by default due to the device using generic USB identifiers, but can be enabled in braille settings. (#18444, @bramd) * When the selection covers more than one cell in Microsoft Excel, pressing `tab` or `enter` to move the active cell now reports the new active cell rather than the whole selection. (#6959, @CyrilleB79) * In terminal programs on Windows 10 version 1607 and later, the calculation of changed text now runs within NVDA instead of via an external process, which may improve performance and reliability. (#18480, @codeofdusk) * The NVDA Remote Access connection dialog now remembers the most recent connection mode, server type and locally hosted port of manual connections. (#18512, #18701) @@ -262,7 +264,7 @@ Please refer to [the developer guide](https://download.nvaccess.org/documentatio * `SynthDriver.isSpeaking` * `easeOfAccess.RegistryKey` and `config.RegistryKey` is deprecated, use `config.registry.RegistryKey` instead. (#18608) * Importing `DEFAULT_EXTENSIONS` from `md2html` is deprecated. -Importing from `md2html` is discouraged. (#18638) + Importing from `md2html` is discouraged. 
(#18638) ## 2025.2 @@ -354,7 +356,7 @@ Please refer to [the developer guide](https://www.nvaccess.org/files/nvda/docume * A `languageHandling` module has been added to report the language within speech sequences. * `LangChangeCommand` of `speech.commands` includes static methods to determine if NVDA should get the language of the text being read, and switch synthesizer voice. * The `brailleTables` module is now a package. -The several built-in table definitions are moved to the `__tables` module in that package. (#18194, @LeonarddeR) + The several built-in table definitions are moved to the `__tables` module in that package. (#18194, @LeonarddeR) * Microsoft SQL Server Management Studio now uses the Visual Studio app module, as SSMS is based on Visual Studio. (#18176, @LeonarddeR) * NVDA will report Windows release revision number (for example: 10.0.26100.0) when `winVersion.getWinVer` is called and log this information at startup. (#18266, @josephsl) @@ -406,9 +408,9 @@ New Coptic, compact Cuneiform, and Portuguese 6 dot Computer Braille tables are Please responsibly disclose security issues following NVDA's [security policy](https://github.com/nvaccess/nvda/blob/master/security.md). * Prevents showing potentially sensitive information on braille displays when the device is locked. -([GHSA-8f8q-2jc3-6rf4](https://github.com/nvaccess/nvda/security/advisories/GHSA-8f8q-2jc3-6rf4)) + ([GHSA-8f8q-2jc3-6rf4](https://github.com/nvaccess/nvda/security/advisories/GHSA-8f8q-2jc3-6rf4)) * Prevents the installer from loading unwanted DLLs from its directory. 
-([GHSA-qf5h-qw92-rx2f](https://github.com/nvaccess/nvda/security/advisories/GHSA-qf5h-qw92-rx2f)) + ([GHSA-qf5h-qw92-rx2f](https://github.com/nvaccess/nvda/security/advisories/GHSA-qf5h-qw92-rx2f)) ### New Features @@ -457,7 +459,7 @@ Please responsibly disclose security issues following NVDA's [security policy](h * NVDA is now able to report caret changes when pressing `alt+upArrow` or `alt+downArrow` gestures, for example in Visual Studio Code. (#17652, @LeonarddeR) * Added commands to move the review cursor to the first and last character of the selected text, assigned to `NVDA+alt+home` and `NVDA+alt+end`, respectively. (#17299, @nvdaes) * Added a general setting to prevent the display turning off during say all or reading with braille. -This option is enabled by default, but may result in increased battery depletion. (#17649, @LeonarddeR) + This option is enabled by default, but may result in increased battery depletion. (#17649, @LeonarddeR) * NVDA is now translated into Bosnian. (#17953) * In Adobe Acrobat, NVDA can now read and interact with math equations in PDF documents generated by recent versions of Microsoft Word. (#18056) @@ -478,7 +480,7 @@ This option is enabled by default, but may result in increased battery depletion * The "Contributors" file has been removed from the NVDA menu. (#16922) * The NVDA license is now formatted in HTML to improve readability. (#17600) * Short versions of the most commonly used command line options have been added: `-d` for `--disable-addons` and `-n` for `--lang`. - Prefix matching on command line flags, e.g. using `--di` for `--disable-addons` is no longer supported. (#11644, @CyrilleB79) + Prefix matching on command line flags, e.g. using `--di` for `--disable-addons` is no longer supported. (#11644, @CyrilleB79) * Changes to the COM Registration Fixing Tool: (#12355, @XLTechie) * It now starts with a more user friendly explanation of its purpose, instead of a warning. 
(#12351) * The initial window can now be exited with `escape` or `alt+f4`. (#10799) @@ -487,7 +489,7 @@ This option is enabled by default, but may result in increased battery depletion * Microsoft Speech API version 5 and Microsoft Speech Platform voices now use WASAPI for audio output, which may improve the responsiveness of those voices. (#13284, @gexgd0419) * The silence at the beginning of speech will now be trimmed when using OneCore voices, SAPI5 voices, and some third-party voice add-ons to improve their responsiveness. (#17614, @gexgd0419) * Microsoft Speech API version 4 voices now use WASAPI for audio output, so that they can work with features such as audio ducking, leading silence trimming, and keeping audio device awake. - If this does not work with your SAPI 4 voice, you can disable WASAPI for SAPI 4 in Advanced settings. (#17718, #17801, @gexgd0419) + If this does not work with your SAPI 4 voice, you can disable WASAPI for SAPI 4 in Advanced settings. (#17718, #17801, @gexgd0419) * Unicode Normalization is now enabled by default for speech output. (#17017, @LeonarddeR). * You can still disable this functionality in the Speech category of the NVDA Settings dialog. * The keyboard settings for "Speak typed characters" and "Speak typed words" now have three options: Off, Only in edit controls, and Always. (#17505, @Cary-rowen) @@ -520,14 +522,14 @@ This option is enabled by default, but may result in increased battery depletion * The current checkbox state (checked/unchecked) of checkboxes in dialogs is now also reported in braille, not just speech. (#17218, @michaelweghorn) * Math: * Math reading has been fixed for some web elements. - Specifically, MathML inside of span and other elements that have the attribute `role="math"`. (#15058) + Specifically, MathML inside of span and other elements that have the attribute `role="math"`. 
(#15058) * Math equations only represented by an image and alt text with no MathML for rich navigation, are now treated like normal images, rather than math with no content, allowing the user to jump to them with `g` and to be able to arrow through the alt text by character. (#16007) * IDEs: * NVDA will no longer crash when selecting all text in certain source files in Android Studio or IntelliJ Idea. (#17418, @thgcode) * In Visual Studio Code, NVDA no longer hijacks the `alt+upArrow` and `alt+downArrow` gestures for sentence navigation. (#17082, @LeonarddeR) * Certain section elements are now correctly recognized as editable controls in Visual Studio Code. (#17573, @Cary-rowen) * In Notepad and other UIA documents, and Notepad++ documents on Windows 11, if the last line is empty, the "braille next line command" will move the cursor to the last line. - In any document, if the cursor is on the last line, it will be moved to the end when using this command. (#17251, #17430, @nvdaes) + In any document, if the cursor is on the last line, it will be moved to the end when using this command. (#17251, #17430, @nvdaes) * Configuration profiles: * Braille is no longer dysfunctional when activating "say all" with an associated configuration profile. (#17163, @LeonarddeR) * Fixed an issue where certain settings were explicitly saved to the active configuration profile even when the value of that setting was equal to the value in the base configuration. (#17157, @leonarddeR) @@ -549,7 +551,7 @@ This option is enabled by default, but may result in increased battery depletion Please refer to [the developer guide](https://www.nvaccess.org/files/nvda/documentation/developerGuide.html#API) for information on NVDA's API deprecation and removal process. * Note: this is an Add-on API compatibility breaking release. -Add-ons will need to be re-tested and have their manifest updated. + Add-ons will need to be re-tested and have their manifest updated. 
* Component updates: * Updated Ruff to 0.8.1. (#17102, #17260, #17473) * Updated Comtypes to 1.4.6. (#17061, @LeonarddeR) @@ -571,11 +573,11 @@ Add-ons will need to be re-tested and have their manifest updated. * A new `utils.urlUtils` module with different functions to determine link types * A new `INTERNAL_LINK` state has been added to `controlTypes.states.State` * A new `linkType` property has been added on `NVDAObject`. - It queries the `treeInterceptor` by default, if any. + It queries the `treeInterceptor` by default, if any. * `BrowseModeTreeInterceptor` object has a new `documentUrl` property * `BrowseModeTreeInterceptor` object has a new `getLinkTypeInDocument` method which accepts an URL to check the link type of the object * A `toggleBooleanValue` helper function has been added to `globalCommands`. - It can be used in scripts to report the result when a boolean is toggled in `config.conf` + It can be used in scripts to report the result when a boolean is toggled in `config.conf` * Removed the requirement to indent function parameter lists by two tabs from NVDA's Coding Standards, to be compatible with modern automatic linting. (#17126, @XLTechie) * Added the [VS Code workspace configuration for NVDA](https://nvaccess.org/nvaccess/vscode-nvda) as a git submodule. (#17003) * A new function, `gui.guiHelper.wxCallOnMain`, has been added, which allows safely and synchronously calling wx functions from non-GUI threads, and getting their return value. (#17304) @@ -606,9 +608,9 @@ These are breaking API changes. Please open a GitHub issue if your add-on has an issue with updating to the new API. * The `addonStore.network.BASE_URL` constant has been removed. -As the Add-on Store base URL is now configurable directly within NVDA, no replacement is planned. (#17099) + As the Add-on Store base URL is now configurable directly within NVDA, no replacement is planned. (#17099) * The `updateCheck.CHECK_URL` constant has been removed. 
-As the NVDA update check URL is now configurable directly within NVDA, no replacement is planned. (#17151) + As the NVDA update check URL is now configurable directly within NVDA, no replacement is planned. (#17151) * `NVDAObjects.UIA.winConsoleUIA.WinTerminalUIA` has been removed with no public replacement. (#14047, #16820, @codeofdusk) * `NVDAObjects.IAccessible.ia2TextMozilla.FakeEmbeddingTextInfo` has been removed. (#16768, @jcsteh) * The following symbols in `appModules.soffice` have been renamed (#6915, @michaelweghorn): @@ -621,9 +623,9 @@ As the NVDA update check URL is now configurable directly within NVDA, no replac * Due to the retirement of NVDA's winmm support (#17496, #17532, #17678): * The following symbols have been removed from `nvwave` without replacements: `CALLBACK_EVENT`, `CALLBACK_FUNCTION`, `CALLBACK_NULL`, `HWAVEOUT`, `LPHWAVEOUT`, `LPWAVEFORMATEX`, `LPWAVEHDR`, `MAXPNAMELEN`, `MMSYSERR_NOERROR`, `usingWasapiWavePlayer`, `WAVEHDR`, `WAVEOUTCAPS`, `waveOutProc`, `WAVE_MAPPER`, `WHDR_DONE`, `WinmmWavePlayer`, and `winmm`. * The following symbols have been removed from `nvwave`: `getOutputDeviceNames`, `outputDeviceIDToName`, `outputDeviceNameToID`. - Use `utils.mmdevice.getOutputDevices` instead. + Use `utils.mmdevice.getOutputDevices` instead. * `nvwave.WasapiWavePlayer` has been renamed to `WavePlayer`. - Additionally, the method signature of its `__init__` has changed as follows: + Additionally, the method signature of its `__init__` has changed as follows: * The `outputDevice` parameter should now only be passed string arguments. * The deprecated `closeWhenIdle` and `buffered` parameters have been removed. * `gui.settingsDialogs.AdvancedPanelControls.wasapiComboBox` has been removed. @@ -635,13 +637,13 @@ As the NVDA update check URL is now configurable directly within NVDA, no replac * Prefix matching on command line flags, e.g. using `--di` for `--disable-addons` is no longer supported. 
(#11644, @CyrilleB79) * The `useAsFallBack` keyword argument of `bdDetect.DriverRegistrar` has been renamed to `useAsFallback`. (#17521, @LeonarddeR) * The `[addonStore][showWarning]` configuration setting has been removed. -Instead use `addonStore.dataManager.addonDataManager.storeSettings.showWarning`. (#17597) + Instead use `addonStore.dataManager.addonDataManager.storeSettings.showWarning`. (#17597) * `ui.browseableMessage` now takes a parameter `sanitizeHtmlFunc`. -This defaults to `nh3.clean` with default arguments. -This means any HTML passed into `ui.browseableMessage` using `isHtml=True` is now sanitized by default. -To change sanitization rules, such as whitelisting tags or attributes, create a function that calls `nh3.clean` with the desired parameters. (#16985) + This defaults to `nh3.clean` with default arguments. + This means any HTML passed into `ui.browseableMessage` using `isHtml=True` is now sanitized by default. + To change sanitization rules, such as whitelisting tags or attributes, create a function that calls `nh3.clean` with the desired parameters. (#16985) * `updateCheck.UpdateAskInstallDialog` no longer automatically performs an action when the update or postpone buttons are pressed. -Instead, a `callback` property has been added, which returns a function that performs the appropriate action when called with the return value from the dialog. (#17582) + Instead, a `callback` property has been added, which returns a function that performs the appropriate action when called with the return value from the dialog. (#17582) * Dialogs opened with `gui.runScriptModalDialog` are now recognised as modal by NVDA. 
(#17582) * The following API symbols related to the setting "Automatically set system focus to focusable elements" have been removed with no replacement: (#17598) * `globalCommands.GlobalCommands.script_toggleAutoFocusFocusableElements` @@ -663,9 +665,9 @@ Instead, a `callback` property has been added, which returns a function that per #### Deprecations * The `braille.filter_displaySize` extension point is deprecated. -Please use `braille.filter_displayDimensions` instead. (#17011) + Please use `braille.filter_displayDimensions` instead. (#17011) * The `gui.message.messageBox` and `gui.runScriptModalDialog` functions, and `gui.nvdaControls.MessageDialog` class are deprecated. -Use `gui.message.MessageDialog` instead. (#17582) + Use `gui.message.MessageDialog` instead. (#17582) * The following symbols are deprecated (#17486, @CyrilleB79): * `NoConsoleOptionParser`, `stringToBool`, `stringToLang` in `__main__`; use the same symbols in `argsParsing` instead. * `__main__.parser`; use `argsParsing.getParser()` instead. @@ -855,7 +857,7 @@ Unicode CLDR has also been updated. * In the Python console, the last unexecuted command will no longer be lost when moving in the input history. (#16653, @CyrilleB79) * A unique anonymous ID is now sent as part of optional NVDA usage statistics gathering. (#16266) * By default, a new folder will be created when making a portable copy. -A warning message will inform you if you try writing to a non-empty directory. (#16686) + A warning message will inform you if you try writing to a non-empty directory. (#16686) ### Bug Fixes @@ -890,7 +892,7 @@ A warning message will inform you if you try writing to a non-empty directory. ( * NVDA now uses Ruff instead of flake8 for linting. (#14817) * Fixed NVDA's build system to work properly when using Visual Studio 2022 version 17.10 and above. 
(#16480, @LeonarddeR) * A fixed width font is now used in Log Viewer and in the NVDA Python Console so that the cursor remains in the same column during vertical navigation. -It is especially useful to read the error location markers in tracebacks. (#16321, @CyrilleB79) + It is especially useful to read the error location markers in tracebacks. (#16321, @CyrilleB79) * Support for custom braille tables has been added. (#3304, #16208, @JulienCochuyt, @LeonarddeR) * Tables can be provided in the `brailleTables` folder in an add-on package. * Table metadata can be added to an optional `brailleTables` section in the add-on manifest or to a `.ini` file with the same format found in the brailleTables subdirectory of the scratchpad directory. @@ -900,7 +902,7 @@ It is especially useful to read the error location markers in tracebacks. (#1632 * `wx.CallAfter`, which is wrapped in `monkeyPatches/wxMonkeyPatches.py`, now includes proper `functools.wraps` indication. (#16520, @XLTechie) * There is a new module for scheduling tasks `utils.schedule`, using the pip module `schedule`. (#16636) * You can use `scheduleThread.scheduleDailyJobAtStartUp` to automatically schedule a job that happens after NVDA starts, and every 24 hours after that. - Jobs are scheduled with a delay to avoid conflicts. + Jobs are scheduled with a delay to avoid conflicts. * `scheduleThread.scheduleDailyJob` and `scheduleJob` can be used to schedule jobs at custom times, where a `JobClashError` will be raised on a known job scheduling clash. * It is now possible to create app modules for apps hosting Edge WebView2 (msedgewebview2.exe) controls. (#16705, @josephsl) @@ -1029,7 +1031,7 @@ There are many bug fixes, particularly for the Add-on Store, braille, Libre Offi * This release breaks compatibility with existing add-ons. * Windows 7, and Windows 8 are no longer supported. -Windows 8.1 is the minimum Windows version supported. + Windows 8.1 is the minimum Windows version supported. 
### New Features @@ -1038,20 +1040,20 @@ Windows 8.1 is the minimum Windows version supported. * A new action has been added to open a dedicated webpage to see or provide feedback about the selected add-on. (#15576, @nvdaes) * Added support for Bluetooth Low Energy HID Braille displays. (#15470) * A new Native Selection mode (toggled by `NVDA+shift+f10`) is now available in NVDA's browse mode for Mozilla Firefox. -When turned on, selecting text in browse mode will also manipulate Firefox's own native selection. -Copying text with `control+c` will pass straight through to Firefox, thus copying the rich content, rather than NVDA's plain text representation. -Note however that as Firefox is handling the actual copy, NVDA will not report a "copy to clipboard" message in this mode. (#15830) + When turned on, selecting text in browse mode will also manipulate Firefox's own native selection. + Copying text with `control+c` will pass straight through to Firefox, thus copying the rich content, rather than NVDA's plain text representation. + Note however that as Firefox is handling the actual copy, NVDA will not report a "copy to clipboard" message in this mode. (#15830) * When copying text in Microsoft Word with NVDA's browse mode enabled, formatting is now also included. -A side affect of this is that NVDA will no longer report a "copy to clipboard" message when pressing `control+c` in Microsoft Word / Outlook browse mode, as the application is now handling the copy, not NVDA. (#16129) + A side affect of this is that NVDA will no longer report a "copy to clipboard" message when pressing `control+c` in Microsoft Word / Outlook browse mode, as the application is now handling the copy, not NVDA. (#16129) * A new "on-demand" speech mode has been added. -When speech is on-demand, NVDA does not speak automatically (e.g. when moving the cursor) but still speaks when calling commands whose goal is explicitly to report something (e.g. report window title). 
(#481, @CyrilleB79) + When speech is on-demand, NVDA does not speak automatically (e.g. when moving the cursor) but still speaks when calling commands whose goal is explicitly to report something (e.g. report window title). (#481, @CyrilleB79) * In the Speech category of NVDA's settings, it is now possible to exclude unwanted speech modes from the Cycle speech modes command (`NVDA+s`). (#15806, @lukaszgo1) * If you are currently using the NoBeepsSpeechMode add-on consider uninstalling it, and disabling "beeps" and "on-demand" modes in the settings. ### Changes * NVDA no longer supports Windows 7 and Windows 8. -Windows 8.1 is the minimum Windows version supported. (#15544) + Windows 8.1 is the minimum Windows version supported. (#15544) * Component updates: * Updated LibLouis braille translator to [3.28.0](https://github.com/liblouis/liblouis/releases/tag/v3.28.0). (#15435, #15876, @codeofdusk) * Added new Thai, Romanian, and Filipino Braille tables. @@ -1066,12 +1068,12 @@ Windows 8.1 is the minimum Windows version supported. (#15544) * When requesting formatting information on Excel cells, borders and background will only be reported if there is such formatting. (#15560, @CyrilleB79) * NVDA will again no longer report unlabelled groupings such as in recent versions of Microsoft Office 365 menus. (#15638) * The audio output device and ducking mode options have been removed from the "Select Synthesizer" dialog. -They can be found in the audio settings panel which can be opened with `NVDA+control+u`. (#15512, @codeofdusk) + They can be found in the audio settings panel which can be opened with `NVDA+control+u`. (#15512, @codeofdusk) * The option "Report role when mouse enters object" in NVDA's mouse settings category has been renamed to "Report object when mouse enters it". -This option now announces additional relevant information about an object when the mouse enters it, such as states (checked/pressed) or cell coordinates in a table. 
(#15420, @LeonarddeR) + This option now announces additional relevant information about an object when the mouse enters it, such as states (checked/pressed) or cell coordinates in a table. (#15420, @LeonarddeR) * New items have been added to the Help menu for the NV Access "Get Help" page and Shop. (#14631) * NVDA's support for [Poedit](https://poedit.net) is overhauled for Poedit version 3 and above. -Users of Poedit 1 are encouraged to update to Poedit 3 if they want to rely on enhanced accessibility in Poedit, such as shortcuts to read translator notes and comments. (#15313, #7303, @LeonarddeR) + Users of Poedit 1 are encouraged to update to Poedit 3 if they want to rely on enhanced accessibility in Poedit, such as shortcuts to read translator notes and comments. (#15313, #7303, @LeonarddeR) * Braille viewer and speech viewer are now disabled in secure mode. (#15680) * During object navigation, disabled (unavailable) objects will not be ignored anymore. (#15477, @CyrilleB79) * Added table of contents to key commands document. (#16106) @@ -1100,12 +1102,12 @@ Users of Poedit 1 are encouraged to update to Poedit 3 if they want to rely on e * Words deleted using the `control+backspace` keyboard shortcut are now also properly announced when the deleted word is followed by whitespace (like spaces and tabs). (#15436, @michaelweghorn) * Announcement of the status bar using the `NVDA+end` keyboard shortcut now also works for dialogs in LibreOffice version 24.2 and newer. (#15591, @michaelweghorn) * All expected text attributes are now supported in LibreOffice versions 24.2 and above. - This makes the announcement of spelling errors work when announcing a line in Writer. (#15648, @michaelweghorn) + This makes the announcement of spelling errors work when announcing a line in Writer. (#15648, @michaelweghorn) * Announcement of heading levels now also works for LibreOffice versions 24.2 and newer. 
(#15881, @michaelweghorn) * Microsoft Office: * In Excel with UIA disabled, braille is updated, and the active cell content is spoken, when `control+y`, `control+z` or `alt+backspace` is pressed. (#15547) * In Word with UIA disabled braille is updated when `control+v`, `control+x`, `control+y`, `control+z`, `alt+backspace`, `backspace` or `control+backspace` is pressed. - It is also updated with UIA enabled, when typing text and braille is tethered to review and review follows caret. (#3276) + It is also updated with UIA enabled, when typing text and braille is tethered to review and review follows caret. (#3276) * In Word, the landing cell will now be correctly reported when using the native Word commands for table navigation `alt+home`, `alt+end`, `alt+pageUp` and `alt+pageDown`. (#15805, @CyrilleB79) * Reporting of object shortcut keys has been improved. (#10807, #15816, @CyrilleB79) * The SAPI4 synthesizer now properly supports volume, rate and pitch changes embedded in speech. (#15271, @LeonarddeR) @@ -1130,9 +1132,9 @@ Users of Poedit 1 are encouraged to update to Poedit 3 if they want to rely on e Please refer to [the developer guide](https://www.nvaccess.org/files/nvda/documentation/developerGuide.html#API) for information on NVDA's API deprecation and removal process. * Note: this is an Add-on API compatibility breaking release. -Add-ons will need to be re-tested and have their manifest updated. + Add-ons will need to be re-tested and have their manifest updated. * Building NVDA now requires Visual Studio 2022. -Please refer to the [NVDA docs](https://github.com/nvaccess/nvda/blob/release-2024.1/projectDocs/dev/createDevEnvironment.md) for the specific list of Visual Studio components. (#14313) + Please refer to the [NVDA docs](https://github.com/nvaccess/nvda/blob/release-2024.1/projectDocs/dev/createDevEnvironment.md) for the specific list of Visual Studio components. 
(#14313) * Added the following extension points: * `treeInterceptorHandler.post_browseModeStateChange`. (#14969, @nvdaes) * `speech.speechCanceled`. (#15700, @LeonarddeR) @@ -1140,7 +1142,7 @@ Please refer to the [NVDA docs](https://github.com/nvaccess/nvda/blob/release-20 * It is now possible to use plural forms in an add-on's translations. (#15661, @beqabeqa473) * Included python3.dll in the binary distribution for use by add-ons with external libraries utilizing the [stable ABI](https://docs.python.org/3.11/c-api/stable.html). (#15674, @mzanm) * The `BrailleDisplayDriver` base class now has `numRows` and `numCols` properties to provide information about multi line braille displays. -Setting `numCells` is still supported for single line braille displays and `numCells` will return the total number of cells for multi line braille displays. (#15386) + Setting `numCells` is still supported for single line braille displays and `numCells` will return the total number of cells for multi line braille displays. (#15386) * Updated BrlAPI for BRLTTY to version 0.8.5, and its corresponding python module to a Python 3.11 compatible build. (#15652, @LeonarddeR) * Added the `speech.speakSsml` function, which allows you to write NVDA speech sequences using [SSML](https://www.w3.org/TR/speech-synthesis11/). (#15699, @LeonarddeR) * The following tags are currently supported and translated to appropriate NVDA speech commands: @@ -1176,7 +1178,7 @@ Setting `numCells` is still supported for single line braille displays and `numC * Scripts that perform an action (e.g. move the cursor, change a parameter) should not speak in the "on-demand" mode. * Fixed bug where deleting git-tracked files during `scons -c` resulted in missing UIA COM interfaces on rebuild. (#7070, #10833, @hwf1324) * Fix a bug where some code changes were not detected when building `dist`, that prevented a new build from being triggered. -Now `dist` always rebuilds. (#13372, @hwf1324) + Now `dist` always rebuilds. 
(#13372, @hwf1324) * A `gui.nvdaControls.MessageDialog` with default type of standard, no longer throws a None conversion exception because no sound is assigned. (#16223, @XLTechie) #### API Breaking Changes @@ -1198,18 +1200,18 @@ Please open a GitHub issue if your Add-on has an issue with updating to the new * typing_extensions, these should be supported natively in Python 3.11 (#15544) * nose, instead unittest-xml-reporting is used to generate XML reports. (#15544) * `IAccessibleHandler.SecureDesktopNVDAObject` has been removed. -Instead, when NVDA is running on the user profile, track the existence of the secure desktop with the extension point: `winAPI.secureDesktop.post_secureDesktopStateChange`. (#14488) + Instead, when NVDA is running on the user profile, track the existence of the secure desktop with the extension point: `winAPI.secureDesktop.post_secureDesktopStateChange`. (#14488) * `braille.BrailleHandler.handlePendingCaretUpdate` has been removed with no public replacement. (#15163, @LeonarddeR) * `bdDetect.addUsbDevices and bdDetect.addBluetoothDevices` have been removed. -Braille display drivers should implement the `registerAutomaticDetection` class method instead. -That method receives a `DriverRegistrar` object on which the `addUsbDevices` and `addBluetoothDevices` methods can be used. (#15200, @LeonarddeR) + Braille display drivers should implement the `registerAutomaticDetection` class method instead. + That method receives a `DriverRegistrar` object on which the `addUsbDevices` and `addBluetoothDevices` methods can be used. (#15200, @LeonarddeR) * The default implementation of the check method on `BrailleDisplayDriver` now requires both the `threadSafe` and `supportsAutomaticDetection` attributes to be set to `True`. (#15200, @LeonarddeR) * Passing lambda functions to `hwIo.ioThread.IoThread.queueAsApc` is no longer possible, as functions should be weakly referenceable. (#14627, @LeonarddeR) * `IoThread.autoDeleteApcReference` has been removed. 
(#14924, @LeonarddeR) * To support capital pitch changes, synthesizers must now explicitly declare their support for the `PitchCommand` in the `supportedCommands` attribute on the driver. (#15433, @LeonarddeR) * `speechDictHandler.speechDictVars` has been removed. Use `NVDAState.WritePaths.speechDictsDir` instead of `speechDictHandler.speechDictVars.speechDictsPath`. (#15614, @lukaszgo1) * `languageHandler.makeNpgettext` and `languageHandler.makePgettext` have been removed. -`npgettext` and `pgettext` are supported natively now. (#15546) + `npgettext` and `pgettext` are supported natively now. (#15546) * The app module for [Poedit](https://poedit.net) has been changed significantly. The `fetchObject` function has been removed. (#15313, #7303, @LeonarddeR) * The following redundant types and constants have been removed from `hwPortUtils`: (#15764, @LeonarddeR) * `PCWSTR` @@ -1222,35 +1224,35 @@ That method receives a `DriverRegistrar` object on which the `addUsbDevices` and * `touchHandler.TouchInputGesture.multiFingerActionLabel` has been removed with no replacement. (#15864, @CyrilleB79) * `NVDAObjects.IAccessible.winword.WordDocument.script_reportCurrentHeaders` has been removed with no replacement. (#15904, @CyrilleB79) * The following app modules are removed. -Code which imports from one of them, should instead import from the replacement module. 
(#15618, @lukaszgo1) - -| Removed module name |Replacement module| -|---|---| -|`azardi-2.0` |`azardi20`| -|`azuredatastudio` |`code`| -|`azuredatastudio-insiders` |`code`| -|`calculatorapp` |`calculator`| -|`code - insiders` |`code`| -|`commsapps` |`hxmail`| -|`dbeaver` |`eclipse`| -|`digitaleditionspreview` |`digitaleditions`| -|`esybraille` |`esysuite`| -|`hxoutlook` |`hxmail`| -|`miranda64` |`miranda32`| -|`mpc-hc` |`mplayerc`| -|`mpc-hc64` |`mplayerc`| -|`notepad++` |`notepadPlusPlus`| -|`searchapp` |`searchui`| -|`searchhost` |`searchui`| -|`springtoolsuite4` |`eclipse`| -|`sts` |`eclipse`| -|`teamtalk3` |`teamtalk4classic`| -|`textinputhost` |`windowsinternal_composableshell_experiences_textinput_inputapp`| -|`totalcmd64` |`totalcmd`| -|`win32calc` |`calc`| -|`winmail` |`msimn`| -|`zend-eclipse-php` |`eclipse`| -|`zendstudio` |`eclipse`| + Code which imports from one of them, should instead import from the replacement module. (#15618, @lukaszgo1) + +| Removed module name | Replacement module | +| ---------------------------- | ------------------------------------------------------------------ | +| `azardi-2.0` | `azardi20` | +| `azuredatastudio` | `code` | +| `azuredatastudio-insiders` | `code` | +| `calculatorapp` | `calculator` | +| `code - insiders` | `code` | +| `commsapps` | `hxmail` | +| `dbeaver` | `eclipse` | +| `digitaleditionspreview` | `digitaleditions` | +| `esybraille` | `esysuite` | +| `hxoutlook` | `hxmail` | +| `miranda64` | `miranda32` | +| `mpc-hc` | `mplayerc` | +| `mpc-hc64` | `mplayerc` | +| `notepad++` | `notepadPlusPlus` | +| `searchapp` | `searchui` | +| `searchhost` | `searchui` | +| `springtoolsuite4` | `eclipse` | +| `sts` | `eclipse` | +| `teamtalk3` | `teamtalk4classic` | +| `textinputhost` | `windowsinternal_composableshell_experiences_textinput_inputapp` | +| `totalcmd64` | `totalcmd` | +| `win32calc` | `calc` | +| `winmail` | `msimn` | +| `zend-eclipse-php` | `eclipse` | +| `zendstudio` | `eclipse` | #### Deprecations @@ 
-1262,7 +1264,7 @@ Code which imports from one of them, should instead import from the replacement * `winVersion.WIN7_SP1` * `winVersion.WIN8` * The `bdDetect.KEY_*` constants have been deprecated. -Use `bdDetect.DeviceType.*` instead. (#15772, @LeonarddeR). + Use `bdDetect.DeviceType.*` instead. (#15772, @LeonarddeR). * The `bdDetect.DETECT_USB` and `bdDetect.DETECT_BLUETOOTH` constants have been deprecated with no public replacement. (#15772, @LeonarddeR). * Using `gui.ExecAndPump` is deprecated - please use `systemUtils.ExecAndPump` instead. (#15852, @lukaszgo1) @@ -1274,7 +1276,7 @@ Please responsibly disclose security issues following NVDA's [security policy](h ### Security Fixes * Prevents loading custom configuration while secure mode is forced. -([GHSA-727q-h8j2-6p45](https://github.com/nvaccess/nvda/security/advisories/GHSA-727q-h8j2-6p45)) + ([GHSA-727q-h8j2-6p45](https://github.com/nvaccess/nvda/security/advisories/GHSA-727q-h8j2-6p45)) ### Bug Fixes @@ -1289,7 +1291,7 @@ Please responsibly disclose security issues following NVDA's [security policy](h ### Security Fixes * Prevents possible reflected XSS attack from crafted content to cause arbitrary code execution. -([GHSA-xg6w-23rw-39r8](https://github.com/nvaccess/nvda/security/advisories/GHSA-xg6w-23rw-39r8)) + ([GHSA-xg6w-23rw-39r8](https://github.com/nvaccess/nvda/security/advisories/GHSA-xg6w-23rw-39r8)) ## 2023.3.2 @@ -1300,8 +1302,8 @@ Please responsibly disclose security issues following NVDA's [security policy](h ### Security Fixes * The security patch in 2023.3.1 was not resolved correctly. -Prevents possible system access and arbitrary code execution with system privileges for unauthenticated users. -([GHSA-h7pp-6jqw-g3pj](https://github.com/nvaccess/nvda/security/advisories/GHSA-h7pp-6jqw-g3pj)) + Prevents possible system access and arbitrary code execution with system privileges for unauthenticated users. 
+ ([GHSA-h7pp-6jqw-g3pj](https://github.com/nvaccess/nvda/security/advisories/GHSA-h7pp-6jqw-g3pj)) ## 2023.3.1 @@ -1311,7 +1313,7 @@ Please responsibly disclose security issues following NVDA's [security policy](h ### Security Fixes * Prevents possible system access and arbitrary code execution with system privileges for unauthenticated users. -([GHSA-h7pp-6jqw-g3pj](https://github.com/nvaccess/nvda/security/advisories/GHSA-h7pp-6jqw-g3pj)) + ([GHSA-h7pp-6jqw-g3pj](https://github.com/nvaccess/nvda/security/advisories/GHSA-h7pp-6jqw-g3pj)) ## 2023.3 @@ -1335,11 +1337,11 @@ There's also been bug fixes for the Add-on Store, Microsoft Office, Microsoft Ed * An option in Audio settings to have the volume of NVDA sounds and beeps follow the volume setting of the voice you are using. (#1409) * An option in Audio settings to separately configure the volume of NVDA sounds. (#1409, #15038) * The settings to change audio output device and toggle audio ducking have been moved to the new Audio settings panel from the Select Synthesizer dialog. - These options will be removed from the "select synthesizer" dialog in 2024.1. (#15486, #8711) + These options will be removed from the "select synthesizer" dialog in 2024.1. (#15486, #8711) * NVDA will now output audio via the Windows Audio Session API (WASAPI), which may improve the responsiveness, performance and stability of NVDA speech and sounds. (#14697, #11169, #11615, #5096, #10185, #11061) * Note: WASAPI is incompatible with some add-ons. - Compatible updates are available for these add-ons, please update them before updating NVDA. - Incompatible versions of these add-ons will be disabled when updating NVDA: + Compatible updates are available for these add-ons, please update them before updating NVDA. + Incompatible versions of these add-ons will be disabled when updating NVDA: * Tony's Enhancements version 1.15 or older. (#15402) * NVDA global commands extension 12.0.8 or older. 
(#15443) * NVDA is now able to continually update the result when performing optical character recognition (OCR), speaking new text as it appears. (#2797) @@ -1353,7 +1355,7 @@ There's also been bug fixes for the Add-on Store, Microsoft Office, Microsoft Ed * Braille: * When the text in a terminal changes without updating the caret, the text on a braille display will now properly update when positioned on a changed line. - This includes situations where braille is tethered to review. (#15115) + This includes situations where braille is tethered to review. (#15115) * More BRLTTY key bindings are now mapped to NVDA commands (#6483): * `learn`: toggle NVDA input help * `prefmenu`: open the NVDA menu @@ -1363,7 +1365,7 @@ There's also been bug fixes for the Add-on Store, Microsoft Office, Microsoft Ed * `say_below`: Say all using review cursor * The BRLTTY driver is only available when a BRLTTY instance with BrlAPI enabled is running. (#15335) * The advanced setting to enable support for HID braille has been removed in favor of a new option. - You can now disable specific drivers for braille display auto detection in the braille display selection dialog. (#15196) + You can now disable specific drivers for braille display auto detection in the braille display selection dialog. (#15196) * Add-on Store: Installed add-ons will now be listed in the Available Add-ons tab, if they are available in the store. (#15374) * Some shortcut keys have been updated in the NVDA menu. (#15364) @@ -1395,26 +1397,26 @@ There's also been bug fixes for the Add-on Store, Microsoft Office, Microsoft Ed Please refer to [the developer guide](https://www.nvaccess.org/files/nvda/documentation/developerGuide.html#API) for information on NVDA's API deprecation and removal process. * `braille.handler.handleUpdate` and `braille.handler.handleReviewMove` have been changed in order not to update instantly. 
-Before this change, when either of these methods was called very often, this would drain many resources. -These methods now queue an update at the end of every core cycle instead. -They should also be thread safe, making it possible to call them from background threads. (#15163) + Before this change, when either of these methods was called very often, this would drain many resources. + These methods now queue an update at the end of every core cycle instead. + They should also be thread safe, making it possible to call them from background threads. (#15163) * Added official support to register custom braille display drivers in the automatic braille display detection process. -Consult the `braille.BrailleDisplayDriver` class documentation for more details. -Most notably, the `supportsAutomaticDetection` attribute must be set to `True` and the `registerAutomaticDetection` `classmethod` must be implemented. (#15196) + Consult the `braille.BrailleDisplayDriver` class documentation for more details. + Most notably, the `supportsAutomaticDetection` attribute must be set to `True` and the `registerAutomaticDetection` `classmethod` must be implemented. (#15196) #### Deprecations * `braille.BrailleHandler.handlePendingCaretUpdate` is now deprecated with no public replacement. -It will be removed in 2024.1. (#15163) + It will be removed in 2024.1. (#15163) * Importing the constants `xlCenter`, `xlJustify`, `xlLeft`, `xlRight`, `xlDistributed`, `xlBottom`, `xlTop` from `NVDAObjects.window.excel` is deprecated. -Use `XlHAlign` or `XlVAlign` enumerations instead. (#15205) + Use `XlHAlign` or `XlVAlign` enumerations instead. (#15205) * The mapping `NVDAObjects.window.excel.alignmentLabels` is deprecated. -Use the `displayString` methods of `XlHAlign` or `XlVAlign` enumerations instead. (#15205) + Use the `displayString` methods of `XlHAlign` or `XlVAlign` enumerations instead. (#15205) * `bdDetect.addUsbDevices` and `bdDetect.addBluetoothDevices` have been deprecated. 
-Braille display drivers should implement the `registerAutomaticDetection` classmethod instead. -That method receives a `DriverRegistrar` object on which the `addUsbDevices` and `addBluetoothDevices` methods can be used. (#15200) + Braille display drivers should implement the `registerAutomaticDetection` classmethod instead. + That method receives a `DriverRegistrar` object on which the `addUsbDevices` and `addBluetoothDevices` methods can be used. (#15200) * The default implementation of the check method on `BrailleDisplayDriver` uses `bdDetect.driverHasPossibleDevices` for devices that are marked as thread safe. -Starting from NVDA 2024.1, in order for the base method to use `bdDetect.driverHasPossibleDevices`, the `supportsAutomaticDetection` attribute must be set to `True` as well. (#15200) + Starting from NVDA 2024.1, in order for the base method to use `bdDetect.driverHasPossibleDevices`, the `supportsAutomaticDetection` attribute must be set to `True` as well. (#15200) ## 2023.2 @@ -1451,7 +1453,7 @@ eSpeak-NG, LibLouis braille translator, and Unicode CLDR have been updated. * When pressing `numpad2` three times to report the numerical value of the character at the position of the review cursor, the information is now also provided in braille. (#14826) * Added support for the `aria-brailleroledescription` ARIA 1.3 attribute, allowing web authors to override the type of an element shown on the braille display. (#14748) * Baum braille driver: added several braille chord gestures for performing common keyboard commands such as `windows+d` and `alt+tab`. - Please refer to the NVDA User Guide for a full list. (#14714) + Please refer to the NVDA User Guide for a full list. (#14714) * Added pronunciation of Unicode symbols: * braille symbols such as `⠐⠣⠃⠗⠇⠐⠜`. (#13778) * Mac Option key symbol `⌥`. (#14682) @@ -1470,7 +1472,7 @@ eSpeak-NG, LibLouis braille translator, and Unicode CLDR have been updated. 
* Experimental enhanced sound management: * NVDA can now output audio via the Windows Audio Session API (WASAPI), which may improve the responsiveness, performance and stability of NVDA speech and sounds. (#14697) * WASAPI usage can be enabled in Advanced settings. - Additionally, if WASAPI is enabled, the following Advanced settings can also be configured. + Additionally, if WASAPI is enabled, the following Advanced settings can also be configured. * An option to have the volume of NVDA sounds and beeps follow the volume setting of the voice you are using. (#1409) * An option to separately configure the volume of NVDA sounds. (#1409, #15038) * There is a known issue with intermittent crashing when WASAPI is enabled. (#15150) @@ -1491,9 +1493,9 @@ eSpeak-NG, LibLouis braille translator, and Unicode CLDR have been updated. * When moving to a different cell in LibreOffice Calc, NVDA no longer incorrectly announces the coordinates of the previously focused cell when cell coordinate announcement is disabled in NVDA's settings. (#15098) * Braille changes: * When using a braille display via the Standard HID braille driver, the dpad can be used to emulate the arrow keys and enter. - Also `space+dot1` and `space+dot4` now map to up and down arrow respectively. (#14713) + Also `space+dot1` and `space+dot4` now map to up and down arrow respectively. (#14713) * Updates to dynamic web content (ARIA live regions) are now displayed in braille. - This can be disabled in the Advanced Settings panel. (#7756) + This can be disabled in the Advanced Settings panel. (#7756) * Dash and em-dash symbols will always be sent to the synthesizer. (#13830) * Distance reported in Microsoft Word will now honour the unit defined in Word's advanced options even when using UIA to access Word documents. (#14542) * NVDA responds faster when moving the cursor in edit controls. (#14708) @@ -1508,7 +1510,7 @@ eSpeak-NG, LibLouis braille translator, and Unicode CLDR have been updated. 
* Several stability fixes to input/output for braille displays, resulting in less frequent errors and crashes of NVDA. (#14627) * NVDA will no longer unnecessarily switch to no braille multiple times during auto detection, resulting in a cleaner log and less overhead. (#14524) * NVDA will now switch back to USB if a HID Bluetooth device (such as the HumanWare Brailliant or APH Mantis) is automatically detected and an USB connection becomes available. - This only worked for Bluetooth Serial ports before. (#14524) + This only worked for Bluetooth Serial ports before. (#14524) * When no braille display is connected and the braille viewer is closed by pressing `alt+f4` or clicking the close button, the display size of the braille subsystem will again be reset to no cells. (#15214) * Web browsers: * NVDA no longer occasionally causes Mozilla Firefox to crash or stop responding. (#14647) @@ -1517,7 +1519,7 @@ eSpeak-NG, LibLouis braille translator, and Unicode CLDR have been updated. * In Mozilla Firefox, moving the mouse over text after a link now reliably reports the text. (#9235) * The destination of graphic links is now reported accurately in more cases in Chrome and Edge. (#14783) * When trying to report the URL for a link without a href attribute NVDA is no longer silent. - Instead NVDA reports that the link has no destination. (#14723) + Instead NVDA reports that the link has no destination. (#14723) * In Browse mode, NVDA will no longer incorrectly ignore focus moving to a parent or child control e.g. moving from a control to its parent list item or gridcell. (#14611) * Note however that this fix only applies when the Automatically set focus to focusable elements" option in Browse Mode settings is turned off (which is the default). * Fixes for Windows 11: @@ -1546,21 +1548,21 @@ eSpeak-NG, LibLouis braille translator, and Unicode CLDR have been updated. 
Please refer to [the developer guide](https://www.nvaccess.org/files/nvda/documentation/developerGuide.html#API) for information on NVDA's API deprecation and removal process. * Suggested conventions have been added to the add-on manifest specification. -These are optional for NVDA compatibility, but are encouraged or required for submitting to the Add-on Store. (#14754) + These are optional for NVDA compatibility, but are encouraged or required for submitting to the Add-on Store. (#14754) * Use `lowerCamelCase` for the name field. * Use `<major>.<minor>.<patch>` format for the version field (required for add-on datastore). * Use `https://` as the schema for the url field (required for add-on datastore). * Added a new extension point type called `Chain`, which can be used to iterate over iterables returned by registered handlers. (#14531) * Added the `bdDetect.scanForDevices` extension point. -Handlers can be registered that yield `BrailleDisplayDriver/DeviceMatch` pairs that don't fit in existing categories, like USB or Bluetooth. (#14531) + Handlers can be registered that yield `BrailleDisplayDriver/DeviceMatch` pairs that don't fit in existing categories, like USB or Bluetooth. (#14531) * Added extension point: `synthDriverHandler.synthChanged`. (#14618) * The NVDA Synth Settings Ring now caches available setting values the first time they're needed, rather than when loading the synthesizer. (#14704) * You can now call the export method on a gesture map to export it to a dictionary. -This dictionary can be imported in another gesture by passing it either to the constructor of `GlobalGestureMap` or to the update method on an existing map. (#14582) + This dictionary can be imported in another gesture by passing it either to the constructor of `GlobalGestureMap` or to the update method on an existing map. (#14582) * `hwIo.base.IoBase` and its derivatives now have a new constructor parameter to take a `hwIo.ioThread.IoThread`. -If not provided, the default thread is used. 
(#14627) + If not provided, the default thread is used. (#14627) * `hwIo.ioThread.IoThread` now has a `setWaitableTimer` method to set a waitable timer using a python function. -Similarly, the new `getCompletionRoutine` method allows you to convert a python method into a completion routine safely. (#14627) + Similarly, the new `getCompletionRoutine` method allows you to convert a python method into a completion routine safely. (#14627) * `offsets.OffsetsTextInfo._get_boundingRects` should now always return `List[locationHelper.rectLTWH]` as expected for a subclass of `textInfos.TextInfo`. (#12424) * `highlight-color` is now a format field attribute. (#14610) * NVDA should more accurately determine if a logged message is coming from NVDA core. (#14812) @@ -1572,18 +1574,18 @@ Similarly, the new `getCompletionRoutine` method allows you to convert a python #### Deprecations * Passing lambda functions to `hwIo.ioThread.IoThread.queueAsApc` is deprecated. -Instead, functions should be weakly referenceable. (#14627) + Instead, functions should be weakly referenceable. (#14627) * Importing `LPOVERLAPPED_COMPLETION_ROUTINE` from `hwIo.base` is deprecated. -Instead import from `hwIo.ioThread`. (#14627) + Instead import from `hwIo.ioThread`. (#14627) * `IoThread.autoDeleteApcReference` is deprecated. -It was introduced in NVDA 2023.1 and was never meant to be part of the public API. -Until removal, it behaves as a no-op, i.e. a context manager yielding nothing. (#14924) + It was introduced in NVDA 2023.1 and was never meant to be part of the public API. + Until removal, it behaves as a no-op, i.e. a context manager yielding nothing. (#14924) * `gui.MainFrame.onAddonsManagerCommand` is deprecated, use `gui.MainFrame.onAddonStoreCommand` instead. (#13985) * `speechDictHandler.speechDictVars.speechDictsPath` is deprecated, use `NVDAState.WritePaths.speechDictsDir` instead. 
(#15021) * Importing `voiceDictsPath` and `voiceDictsBackupPath` from `speechDictHandler.dictFormatUpgrade` is deprecated. -Instead use `WritePaths.voiceDictsDir` and `WritePaths.voiceDictsBackupDir` from `NVDAState`. (#15048) + Instead use `WritePaths.voiceDictsDir` and `WritePaths.voiceDictsBackupDir` from `NVDAState`. (#15048) * `config.CONFIG_IN_LOCAL_APPDATA_SUBKEY` is deprecated. -Instead use `config.RegistryKey.CONFIG_IN_LOCAL_APPDATA_SUBKEY`. (#15049) + Instead use `config.RegistryKey.CONFIG_IN_LOCAL_APPDATA_SUBKEY`. (#15049) ## 2023.1 @@ -1612,24 +1614,24 @@ Note: * Microsoft Excel via UI Automation: Automatic reporting of column and row headers in tables. (#14228) * Note: This is referring to tables formatted via the "Table" button on the Insert pane of the Ribbon. - "First Column" and "Header Row" in "Table Style Options" correspond to column and row headers respectively. + "First Column" and "Header Row" in "Table Style Options" correspond to column and row headers respectively. * This is not referring to screen reader specific headers via named ranges, which is currently not supported via UI Automation. * An unassigned script has been added to toggle delayed character descriptions. (#14267) * Added an experimental option to leverage the UIA notification support in Windows Terminal to report new or changed text in the terminal, resulting in improved stability and responsivity. (#13781) * Consult the user guide for limitations of this experimental option. * On Windows 11 ARM64, browse mode is now available in AMD64 apps such as Firefox, Google Chrome and 1Password. (#14397) * A new option has been added, "Paragraph Style" in "Document Navigation". -This adds support for single line break (normal) and multi line break (block) paragraph navigation. -This can be used with text editors that do not support paragraph navigation natively, such as Notepad and Notepad++. 
(#13797) + This adds support for single line break (normal) and multi line break (block) paragraph navigation. + This can be used with text editors that do not support paragraph navigation natively, such as Notepad and Notepad++. (#13797) * The presence of multiple annotations are now reported. -`NVDA+d` now cycles through reporting the summary of each annotation target for origins with multiple annotation targets. -For example, when text has a comment and a footnote associated with it. (#14507, #14480) + `NVDA+d` now cycles through reporting the summary of each annotation target for origins with multiple annotation targets. + For example, when text has a comment and a footnote associated with it. (#14507, #14480) * Added support for Tivomatic Caiku Albatross 46/80 braille displays. (#13045) * New global command: Report link destination (`NVDA+k`). -Pressed once will speak/braille the destination of the link that is in the navigator object. -Pressing twice will show it in a window, for more detailed review. (#14583) + Pressed once will speak/braille the destination of the link that is in the navigator object. + Pressing twice will show it in a window, for more detailed review. (#14583) * New unmapped global command (Tools category): Report link destination in a window. -Same as pressing `NVDA+k` twice, but may be more useful for braille users. (#14583) + Same as pressing `NVDA+k` twice, but may be more useful for braille users. (#14583) ### Changes @@ -1688,7 +1690,7 @@ Please refer to [the developer guide](https://www.nvaccess.org/files/nvda/docume * System tests should now pass when run locally on non-English systems. (#13362) * In Windows 11 on ARM, x64 apps are no longer identified as ARM64 applications. 
(#14403) * It is no longer necessary to use `SearchField` and `SuggestionListItem` `UIA` `NVDAObjects` in new UI Automation scenarios, where automatic reporting of search suggestions, and where typing has been exposed via UI Automation with the `controllerFor` pattern. -This functionality is now available generically via `behaviours.EditableText` and the base `NVDAObject` respectively. (#14222) + This functionality is now available generically via `behaviours.EditableText` and the base `NVDAObject` respectively. (#14222) * The UIA debug logging category when enabled now produces significantly more logging for UIA event handlers and utilities. (#14256) * NVDAHelper build standards updated. (#13072) * Now uses the C++20 standard, was C++17. @@ -1727,7 +1729,7 @@ Please open a GitHub issue if your Add-on has an issue with updating to the new * `autoTether` has been removed; `tetherTo` can now take the value "auto" instead. * In `[keyboard]` section (#14528): * `useCapsLockAsNVDAModifierKey`, `useNumpadInsertAsNVDAModifierKey`, `useExtendedInsertAsNVDAModifierKey` have been removed. - They are replaced by `NVDAModifierKeys`. + They are replaced by `NVDAModifierKeys`. * The `NVDAHelper.RemoteLoader64` class has been removed with no replacement. (#14449) * The following functions in `winAPI.sessionTracking` are removed with no replacement. (#14416, #14490) * `isWindowsLocked` @@ -1736,13 +1738,13 @@ Please open a GitHub issue if your Add-on has an issue with updating to the new * `register` * `isLockStateSuccessfullyTracked` * It is no longer possible to enable/disable the braille handler by setting `braille.handler.enabled`. -To disable the braille handler programatically, register a handler to `braille.handler.decide_enabled`. (#14503) + To disable the braille handler programatically, register a handler to `braille.handler.decide_enabled`. (#14503) * It is no longer possible to update the display size of the handler by setting `braille.handler.displaySize`. 
-To update the displaySize programatically, register a handler to `braille.handler.filter_displaySize`. -Refer to `brailleViewer` for an example on how to do this. (#14503) + To update the displaySize programatically, register a handler to `braille.handler.filter_displaySize`. + Refer to `brailleViewer` for an example on how to do this. (#14503) * There have been changes to the usage of `addonHandler.Addon.loadModule`. (#14481) * `loadModule` now expects dot as a separator, rather than backslash. - For example "lib.example" instead of "lib\example". + For example "lib.example" instead of "lib\example". * `loadModule` now raises an exception when a module can't be loaded or has errors, instead of silently returning `None` without giving information about the cause. * The following symbols have been removed from `appModules.foobar2000` with no direct replacement. (#14570) * `statusBarTimes` @@ -1750,7 +1752,7 @@ Refer to `brailleViewer` for an example on how to do this. (#14503) * `getOutputFormat` * `getParsingFormat` * The following are no longer singletons - their get method has been removed. -Usage of `Example.get()` is now `Example()`. (#14248) + Usage of `Example.get()` is now `Example()`. (#14248) * `UIAHandler.customAnnotations.CustomAnnotationTypesCommon` * `UIAHandler.customProps.CustomPropertiesCommon` * `NVDAObjects.UIA.excel.ExcelCustomProperties` @@ -1760,17 +1762,17 @@ Usage of `Example.get()` is now `Example()`. (#14248) * `NVDAObjects.UIA.winConsoleUIA.WinTerminalUIA` is deprecated and usage is discouraged. (#14047) * `config.addConfigDirsToPythonPackagePath` has been moved. -Use `addonHandler.packaging.addDirsToPythonPackagePath` instead. (#14350) + Use `addonHandler.packaging.addDirsToPythonPackagePath` instead. (#14350) * `braille.BrailleHandler.TETHER_*` are deprecated. -Use `configFlags.TetherTo.*.value` instead. (#14233) + Use `configFlags.TetherTo.*.value` instead. (#14233) * `utils.security.postSessionLockStateChanged` is deprecated. 
-Use `utils.security.post_sessionLockStateChanged` instead. (#14486) + Use `utils.security.post_sessionLockStateChanged` instead. (#14486) * `NVDAObject.hasDetails`, `NVDAObject.detailsSummary`, `NVDAObject.detailsRole` has been deprecated. -Use `NVDAObject.annotations` instead. (#14507) + Use `NVDAObject.annotations` instead. (#14507) * `keyboardHandler.SUPPORTED_NVDA_MODIFIER_KEYS` is deprecated with no direct replacement. -Consider using the class `config.configFlags.NVDAKey` instead. (#14528) + Consider using the class `config.configFlags.NVDAKey` instead. (#14528) * `gui.MainFrame.evaluateUpdatePendingUpdateMenuItemCommand` has been deprecated. -Use `gui.MainFrame.SysTrayIcon.evaluateUpdatePendingUpdateMenuItemCommand` instead. (#14523) + Use `gui.MainFrame.SysTrayIcon.evaluateUpdatePendingUpdateMenuItemCommand` instead. (#14523) ## 2022.4 @@ -1828,10 +1830,10 @@ There are new Chinese, Swedish, Luganda and Kinyarwanda braille tables. * Selective registration for UI Automation events and property changes now enabled by default. * Text reporting, Braille output, and password suppression now work as expected in the embedded Windows Terminal control in Visual Studio 2022. (#14194) * NVDA is now DPI aware when using multiple monitors. -There are several fixes for using a DPI setting higher than 100% or multiple monitors. -Issues may still exist with versions of Windows older than Windows 10 1809. -For these fixes to work, applications which NVDA interacts with also need to be DPI aware. -Note there are still known issues with Chrome and Edge. (#13254) + There are several fixes for using a DPI setting higher than 100% or multiple monitors. + Issues may still exist with versions of Windows older than Windows 10 1809. + For these fixes to work, applications which NVDA interacts with also need to be DPI aware. + Note there are still known issues with Chrome and Edge. (#13254) * Visual highlighting frames should now be correctly placed in most applications. 
(#13370, #3875, #12070) * Touch screen interaction should now be accurate for most applications. (#7083) * Mouse tracking should now work for most applications. (#6722) @@ -1862,7 +1864,7 @@ This also addresses a security issue. ### Security Fixes * Prevents possible system access (e.g. NVDA Python console) for unauthenticated users. -([GHSA-fpwc-2gxx-j9v7](https://github.com/nvaccess/nvda/security/advisories/GHSA-fpwc-2gxx-j9v7)) + ([GHSA-fpwc-2gxx-j9v7](https://github.com/nvaccess/nvda/security/advisories/GHSA-fpwc-2gxx-j9v7)) ### Bug Fixes @@ -1890,7 +1892,7 @@ This is a minor release to fix regressions with 2022.3.1 and address a security ### Security Fixes * Prevents possible system level access for unauthenticated users. -([GHSA-3jj9-295f-h69w](https://github.com/nvaccess/nvda/security/advisories/GHSA-3jj9-295f-h69w)) + ([GHSA-3jj9-295f-h69w](https://github.com/nvaccess/nvda/security/advisories/GHSA-3jj9-295f-h69w)) ### Bug Fixes @@ -1905,11 +1907,11 @@ Please responsibly disclose security issues to <info@nvaccess.org>. ### Security Fixes * Fixed exploit where it was possible to elevate from user to system privileges. -([GHSA-q7c2-pgqm-vvw5](https://github.com/nvaccess/nvda/security/advisories/GHSA-q7c2-pgqm-vvw5)) + ([GHSA-q7c2-pgqm-vvw5](https://github.com/nvaccess/nvda/security/advisories/GHSA-q7c2-pgqm-vvw5)) * Fixed a security issue allowing access to the python console on the lock screen via a race condition for NVDA startup. -([GHSA-72mj-mqhj-qh4w](https://github.com/nvaccess/nvda/security/advisories/GHSA-72mj-mqhj-qh4w)) + ([GHSA-72mj-mqhj-qh4w](https://github.com/nvaccess/nvda/security/advisories/GHSA-72mj-mqhj-qh4w)) * Fixed issue where speech viewer text is cached when locking Windows. 
-([GHSA-grvr-j2h8-3qm4](https://github.com/nvaccess/nvda/security/advisories/GHSA-grvr-j2h8-3qm4)) + ([GHSA-grvr-j2h8-3qm4](https://github.com/nvaccess/nvda/security/advisories/GHSA-grvr-j2h8-3qm4)) ### Bug Fixes @@ -1931,7 +1933,7 @@ eSpeak has been updated, which introduces 3 new languages: Belarusian, Luxembour * Vastly improved performance and stability. (#10964) * When pressing `control+f` to find text, the review cursor position is updated to follow the found term. (#11172) * Reporting of typed text that does not appear on-screen (such as passwords) is disabled by default. -It can be re-enabled in NVDA's advanced settings panel. (#11554) + It can be re-enabled in NVDA's advanced settings panel. (#11554) * Text that has scrolled offscreen can be reviewed without scrolling the console window. (#12669) * More detailed text formatting information is available. ([microsoft/terminal PR 10336](https://github.com/microsoft/terminal/pull/10336)) * A new Speech option has been added to read character descriptions after a delay. (#13509) @@ -1954,7 +1956,7 @@ It can be re-enabled in NVDA's advanced settings panel. (#11554) * Note that the most up to date version of Adobe Acrobat / Reader is also required to avoid the crash. * Font size measurements are now translatable in NVDA. (#13573) * Ignore Java Access Bridge events where no window handle can be found for Java applications. -This will improve performance for some Java applications including IntelliJ IDEA. (#13039) + This will improve performance for some Java applications including IntelliJ IDEA. (#13039) * Announcement of selected cells for LibreOffice Calc is more efficient and no longer results in a Calc freeze when many cells are selected. (#13232) * When running under a different user, Microsoft Edge is no longer inaccessible. (#13032) * When rate boost is off, eSpeak's rate does not drop anymore between rates 99% and 100%. (#13876) @@ -1980,7 +1982,7 @@ This is a patch release to fix a security issue. 
### Bug Fixes * Fixed an exploit where it was possible to open the NVDA python console via the log viewer on the lock screen. -([GHSA-585m-rpvv-93qg](https://github.com/nvaccess/nvda/security/advisories/GHSA-585m-rpvv-93qg)) + ([GHSA-585m-rpvv-93qg](https://github.com/nvaccess/nvda/security/advisories/GHSA-585m-rpvv-93qg)) ## 2022.2.3 @@ -1989,7 +1991,7 @@ This is a patch release to fix an accidental API breakage introduced in 2022.2.1 ### Bug Fixes * Fixed a bug where NVDA did not announce "Secure Desktop" when entering a secure desktop. -This caused NVDA remote to not recognize secure desktops. (#14094) + This caused NVDA remote to not recognize secure desktops. (#14094) ## 2022.2.2 @@ -2065,7 +2067,7 @@ LibLouis has been updated, which includes a new German braille table. * Braille fixes: * Fix braille output when navigating certain text in Mozilla rich edit controls, such as drafting a message in Thunderbird. (#12542) * When braille is tethered automatically and the mouse is moved with mouse tracking enabled, - text review commands now update the braille display with the spoken content. (#11519) + text review commands now update the braille display with the spoken content. (#11519) * It is now possible to pan the braille display through content after use of text review commands. (#8682) * The NVDA installer can now run from directories with special characters. (#13270) * In Firefox, NVDA no longer fails to report items in web pages when aria-rowindex, aria-colindex, aria-rowcount or aria-colcount attributes are invalid. (#13405) @@ -2078,9 +2080,9 @@ LibLouis has been updated, which includes a new German braille table. * Visual Studio now correctly reports line indentation. (#13574) * NVDA will once again announce Start menu search result details in recent Windows 10 and 11 releases. (#13544) * In Windows 10 and 11 Calculator version 10.1908 and later, -NVDA will announce results when more commands are pressed, such as commands from scientific mode. 
(#13383) + NVDA will announce results when more commands are pressed, such as commands from scientific mode. (#13383) * In Windows 11, it is again possible to navigate and interact with user interface elements, -such as Taskbar and Task View using mouse and touch interaction. (#13506) + such as Taskbar and Task View using mouse and touch interaction. (#13506) * NVDA will announce status bar content in Windows 11 Notepad. (#13688) * Navigator object highlighting now shows up immediately upon activation of the feature. (#13641) * Fix reading single column list view items. (#13659, #13735) @@ -2090,11 +2092,11 @@ such as Taskbar and Task View using mouse and touch interaction. (#13506) ### Changes for Developers * Compiling NVDA dependencies with Visual Studio 2022 (17.0) is now supported. -For development and release builds, Visual Studio 2019 is still used. (#13033) + For development and release builds, Visual Studio 2019 is still used. (#13033) * When retrieving the count of selected children via accSelection, -the case where a negative child ID or an IDispatch is returned by `IAccessible::get_accSelection` is now handled properly. (#13277) + the case where a negative child ID or an IDispatch is returned by `IAccessible::get_accSelection` is now handled properly. (#13277) * New convenience functions `registerExecutableWithAppModule` and `unregisterExecutable` were added to the `appModuleHandler` module. -They can be used to use a single App Module with multiple executables. (#13366) + They can be used to use a single App Module with multiple executables. (#13366) #### Deprecations @@ -2107,36 +2109,36 @@ For add-on authors, please open a GitHub issue if these changes stop the API fro * `appModuleHandler.NVDAProcessID` is deprecated, use `globalVars.appPid` instead. (#13646) * `gui.quit` is deprecated, use `wx.CallAfter(mainFrame.onExitCommand, None)` instead. (#13498) - - + -------------------------------------- * Some alias appModules are marked as deprecated. 
-Code which imports from one of them, should instead import from the replacement module. (#13366) - -| Removed module name |Replacement module| -|---|---| -|azuredatastudio |code| -|azuredatastudio-insiders |code| -|calculatorapp |calculator| -|code - insiders |code| -|commsapps |hxmail| -|dbeaver |eclipse| -|digitaleditionspreview |digitaleditions| -|esybraille |esysuite| -|hxoutlook |hxmail| -|miranda64 |miranda32| -|mpc-hc |mplayerc| -|mpc-hc64 |mplayerc| -|notepad++ |notepadPlusPlus| -|searchapp |searchui| -|searchhost |searchui| -|springtoolsuite4 |eclipse| -|sts |eclipse| -|teamtalk3 |teamtalk4classic| -|textinputhost |windowsinternal_composableshell_experiences_textinput_inputapp| -|totalcmd64 |totalcmd| -|win32calc |calc| -|winmail |msimn| -|zend-eclipse-php |eclipse| -|zendstudio |eclipse| + Code which imports from one of them, should instead import from the replacement module. (#13366) + +| Removed module name | Replacement module | +| ------------------------ | -------------------------------------------------------------- | +| azuredatastudio | code | +| azuredatastudio-insiders | code | +| calculatorapp | calculator | +| code - insiders | code | +| commsapps | hxmail | +| dbeaver | eclipse | +| digitaleditionspreview | digitaleditions | +| esybraille | esysuite | +| hxoutlook | hxmail | +| miranda64 | miranda32 | +| mpc-hc | mplayerc | +| mpc-hc64 | mplayerc | +| notepad++ | notepadPlusPlus | +| searchapp | searchui | +| searchhost | searchui | +| springtoolsuite4 | eclipse | +| sts | eclipse | +| teamtalk3 | teamtalk4classic | +| textinputhost | windowsinternal_composableshell_experiences_textinput_inputapp | +| totalcmd64 | totalcmd | +| win32calc | calc | +| winmail | msimn | +| zend-eclipse-php | eclipse | +| zendstudio | eclipse | ## 2022.1 @@ -2243,8 +2245,8 @@ Note: * `core.CallCancelled` is now `exceptions.CallCancelled`. (#12940) * All constants starting with RPC from `core` and `logHandler` are moved into `RPCConstants.RPC` enum. 
(#12940) * It is recommended that `mouseHandler.doPrimaryClick` and `mouseHandler.doSecondaryClick` functions should be used to click the mouse to perform a logical action such as activating (primary) or secondary (show context menu), -rather than using `executeMouseEvent` and specifying the left or right mouse button specifically. -This ensures code will honor the Windows user setting for swapping the primary mouse button. (#12642) + rather than using `executeMouseEvent` and specifying the left or right mouse button specifically. + This ensures code will honor the Windows user setting for swapping the primary mouse button. (#12642) * `config.getSystemConfigPath` has been removed - there is no replacement. (#12943) * `shlobj.SHGetFolderPath` has been removed - please use `shlobj.SHGetKnownFolderPath` instead. (#12943) * `shlobj` constants have been removed. A new enum has been created, `shlobj.FolderId` for usage with `SHGetKnownFolderPath`. (#12943) @@ -2267,13 +2269,13 @@ This ensures code will honor the Windows user setting for swapping the primary m * Switched from Minhook to Microsoft Detours as a hooking library for NVDA. Hooking with this library is mainly used to aid the display model. (#12964) * `winVersion.WIN10_RELEASE_NAME_TO_BUILDS` is removed. (#13211) * SCons now warns to build with a number of jobs that is equal to the number of logical processors in the system. -This can dramatically decrease build times on multi core systems. (#13226, #13371) + This can dramatically decrease build times on multi core systems. (#13226, #13371) * `characterProcessing.SYMLVL_*` constants are removed - please use `characterProcessing.SymbolLevel.*` instead. (#13248) * Functions `loadState` and `saveState` are removed from addonHandler - please use `addonHandler.state.load` and `addonHandler.state.save` instead. 
(#13245) * Moved the UWP/OneCore interaction layer of NVDAHelper [from C++/CX to C++/Winrt](https://docs.microsoft.com/en-us/windows/uwp/cpp-and-winrt-apis/move-to-winrt-from-cx). (#10662) * It is now mandatory to subclass `DictionaryDialog` to use it. (#13268) * `config.RUN_REGKEY`, `config.NVDA_REGKEY` are deprecated, please use `config.RegistryKey.RUN`, `config.RegistryKey.NVDA` instead. These will be removed in 2023. (#13242) -* `easeOfAccess.ROOT_KEY`, `easeOfAccess.APP_KEY_PATH` are deprecated, please use`easeOfAccess.RegistryKey.ROOT`, `easeOfAccess.RegistryKey.APP` instead. These will be removed in 2023. (#13242) +* `easeOfAccess.ROOT_KEY`, `easeOfAccess.APP_KEY_PATH` are deprecated, please use `easeOfAccess.RegistryKey.ROOT`, `easeOfAccess.RegistryKey.APP` instead. These will be removed in 2023. (#13242) * `easeOfAccess.APP_KEY_NAME` has been deprecated, to be removed in 2023. (#13242) * `DictionaryDialog` and `DictionaryEntryDialog` are moved from `gui.settingsDialogs` to `gui.speechDict`. (#13294) * IAccessible2 relations are now shown in developer info for IAccessible2 objects. (#13315) @@ -2382,7 +2384,7 @@ Affected users will need to download this update manually. * NVDA will default to eSpeak if no installed OneCore voices support the NVDA preferred language. (#10451) * If OneCore voices consistently fail to speak, revert to eSpeak as a synthesizer. (#11544) * When reading status bar with `NVDA+end`, the review cursor is no longer moved to its location. -If you need this functionality please assign a gesture to the appropriate script in the Object Navigation category in the Input Gestures dialog. (#8600) + If you need this functionality please assign a gesture to the appropriate script in the Object Navigation category in the Input Gestures dialog. (#8600) * When opening a settings dialog which is already open, NVDA sets focus on the existing dialog rather than raise an error. 
(#5383) * Updated liblouis braille translator to [3.19.0](https://github.com/liblouis/liblouis/releases/tag/v3.19.0). (#12810) * New braille tables: Russian grade 1, Tshivenda grade 1, Tshivenda grade 2 @@ -2422,12 +2424,12 @@ If you need this functionality please assign a gesture to the appropriate script ### Changes for Developers * Building NVDA now requires Visual Studio 2019 16.10.4 or later. -To match the production build environment, update Visual Studio to keep in sync with the [current version AppVeyor is using](https://www.appveyor.com/docs/windows-images-software/#visual-studio-2019). (#12728) + To match the production build environment, update Visual Studio to keep in sync with the [current version AppVeyor is using](https://www.appveyor.com/docs/windows-images-software/#visual-studio-2019). (#12728) * `NVDAObjects.UIA.winConsoleUIA.WinConsoleUIA.isImprovedTextRangeAvailable` has been deprecated for removal in 2022.1. (#12660) * Instead use `apiLevel` (see the comments at `_UIAConstants.WinConsoleAPILevel` for details). * Transparency of text background color sourced from GDI applications (via the display model), is now exposed for add-ons or appModules. (#12658) * `LOCALE_SLANGUAGE`, `LOCALE_SLIST` and `LOCALE_SLANGDISPLAYNAME` are moved to the `LOCALE` enum in languageHandler. -They are still available at the module level but are deprecated and to be removed in NVDA 2022.1. (#12753) + They are still available at the module level but are deprecated and to be removed in NVDA 2022.1. (#12753) * The usage of functions `addonHandler.loadState` and `addonHandler.saveState` should be replaced with their equivalents `addonHandler.state.save` and `addonHandler.state.load` before 2022.1. (#12792) * Braille output can now be checked in system tests. 
(#12917) @@ -2899,7 +2901,7 @@ Highlights of this release include support for several new braille displays from * speakTextInfo now relies on getTextInfoSpeech * speakWithoutPauses has been converted into a class, and refactored, but should not break compatibility. * getSpeechForSpelling is deprecated (though still available) use getSpellingSpeech instead. - Private changes that should not affect addon developers: + Private changes that should not affect addon developers: * _speakPlaceholderIfEmpty is now _getPlaceholderSpeechIfTextEmpty * _speakTextInfo_addMath is now _extendSpeechSequence_addMathForTextInfo * Speech 'reason' has been converted to an Enum, see controlTypes.OutputReason class. (#10703) @@ -3915,7 +3917,7 @@ Highlights of this release include the ability to indicate spelling errors while * NVDA's C++ components are now built with Microsoft Visual Studio 2015. (#5592) * You can now present a text or HTML message to the user in browse mode using ui.browseableMessage. (#4908) -* In the User Guide, when a +* In the User Guide, when a `` ## 2016.1 @@ -4024,11 +4026,11 @@ Highlights of this release include performance improvements in Windows 10; inclu * In Windows 8 and later, NVDA now starts a lot earlier when configured to start after logging on to Windows. (#308) * If you enabled this using a previous version of NVDA, you will need to disable it and enable it again in order for the change to take effect. Follow this procedure: 1. Open the General Settings dialog. - 1. Uncheck the Automatically start NVDA after I log on to Windows checkbox. - 1. Press the OK button. - 1. Open the General Settings dialog again. - 1. Check the Automatically start NVDA after I log on to Windows checkbox. - 1. Press the OK button. + 2. Uncheck the Automatically start NVDA after I log on to Windows checkbox. + 3. Press the OK button. + 4. Open the General Settings dialog again. + 5. Check the Automatically start NVDA after I log on to Windows checkbox. + 6. 
Press the OK button. * Performance enhancements for UI Automation including File Explorer and Task Viewer. (#5293) * NVDA now correctly switches to focus mode when tabbing to read-only ARIA grid controls in Browse Mode for Mozilla Firefox and other Gecko-based controls. (#5118) * NVDA now correctly reports "no previous" instead of "no next" when there are no more objects when flicking left on a touch screen. @@ -4259,8 +4261,10 @@ Highlights of this release include browse mode for documents in Microsoft Word a * The set column header (NVDA+shift+c) and set row header (NVDA+shift+r) commands now store the settings in the worksheet so that they are available the next time the sheet is opened, and will be available to other screen readers that support the defined name range scheme. * These commands can also now be used multiple times per sheet to set different headers for different regions. * Support for automatic column and row header reading in Microsoft Word (#3110) including: - * Support of Microsoft Word bookmarks to identify header cells (compatible with Jaws screen reader). - - set column header (NVDA+shift+c) and set row header (NVDA+shift+r) commands while on the first header cell in a table allow you to tell NVDA that these headers should be reported automatically. Settings are stored in the document so that they are available the next time the document is opened, and will be available to other screen readers that support the bookmark scheme. +* Support of Microsoft Word bookmarks to identify header cells (compatible with Jaws screen reader). + +- set column header (NVDA+shift+c) and set row header (NVDA+shift+r) commands while on the first header cell in a table allow you to tell NVDA that these headers should be reported automatically. Settings are stored in the document so that they are available the next time the document is opened, and will be available to other screen readers that support the bookmark scheme. 
+ * Microsoft Word: Report the distance from the left edge of the page when the tab key is pressed. (#1353) * Microsoft Word: provide feedback in speech and braille for most available formatting shortcut keys (bold, italic, underline, alignment, outline level, superscript, subscript and font size). (#1353) * Microsoft Excel: If the selected cell contains comments, they can be now reported by pressing NVDA+alt+c. (#2920) @@ -4568,12 +4572,12 @@ The new layout uses the arrow keys in combination with the NVDA key and other mo Please note the following changes to commonly used commands: -| Name |Key| -|---|---| -|Say all |NVDA+a| -|Read current line |NVDA+l| -|Read current text selection |NVDA+shift+s| -|Report status bar |NVDA+shift+end| +| Name | Key | +| --------------------------- | -------------- | +| Say all | NVDA+a | +| Read current line | NVDA+l | +| Read current text selection | NVDA+shift+s | +| Report status bar | NVDA+shift+end | In addition, among other changes, all of the object navigation, text review, mouse click and synth settings ring commands have changed. Please see the [Commands Quick Reference](keyCommands.html) document for the new keys. 
From b40d7095669f18150587aaae88a5636302f8bb73 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Tue, 30 Sep 2025 06:56:48 +0800 Subject: [PATCH 75/93] revert `Initialize Word Segmenters for Unused Languages:` checkbox and fixup the initialization logic --- source/gui/settingsDialogs.py | 7 ------- source/textUtils/__init__.py | 3 +++ source/textUtils/wordSeg/wordSegStrategy.py | 10 ++++++---- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py index 307251705d0..e96fff8fa1d 100644 --- a/source/gui/settingsDialogs.py +++ b/source/gui/settingsDialogs.py @@ -3092,13 +3092,6 @@ class DocumentNavigationPanel(SettingsPanel): def makeSettings(self, settingsSizer: wx.BoxSizer) -> None: sHelper = guiHelper.BoxSizerHelper(self, sizer=settingsSizer) - # Translators: This is a label for the initialization for word segmenters for unused languages in the document navigation dialog - initUnusedLangLabel = _("&Initialize Word Segmenters for Unused Languages:") - self.initUnusedLangCheckBox: wx.CheckBox = sHelper.addItem( - wx.CheckBox(self, label=initUnusedLangLabel), - ) - self.bindHelpEvent("initWordSegForUnusedLang", self.initUnusedLangCheckBox) - # Translators: This is a label for the word segmentation standard in the document navigation dialog WordNavigationUnitLabel = _("&Word Segmentation Standard:") self.wordSegCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl( diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 059165533f0..12ef420d7b5 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -570,6 +570,8 @@ def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: if wordSegStrategy.ChineseWordSegmentationStrategy._lib: return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) else: + log.debugWarning("Chinese word segmenter is loading. 
Falling back to Uniscribe.") + wordSegStrategy.ChineseWordSegmentationStrategy._initCppJieba(forceInit=True) return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) else: return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) @@ -581,6 +583,7 @@ def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: if wordSegStrategy.ChineseWordSegmentationStrategy._lib: return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) else: + log.debugWarning("Chinese word segmenter is loading. Falling back to Uniscribe.") return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) case _: return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index 0596a5bf159..7eb882d6f2c 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -157,16 +157,18 @@ class ChineseWordSegmentationStrategy(WordSegmentationStrategy): @classmethod @initializerRegistry - def _initCppJieba(cls): # TODO: make cppjieba alternative + def _initCppJieba(cls, forceInit: bool = False): # TODO: make cppjieba alternative """ Class-level initializer: attempts to load the versioned cppjieba library and set up ctypes signatures. 
""" import config + from ..segFlag import WordSegFlag - if cls._lib is not None or not ( - config.conf["documentNavigation"]["initWordSegForUnusedLang"] or cls.isUsingRelatedLanguage() - ): + if not forceInit and (cls._lib or ( + not config.conf["documentNavigation"]["wordSegmentationStandard"].calculate() == WordSegFlag.CHINESE \ + and not cls.isUsingRelatedLanguage() + )): return try: from NVDAState import ReadPaths From 653e8087b0d8c87d8c30053804e1a8656db5284b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Sep 2025 22:58:20 +0000 Subject: [PATCH 76/93] Pre-commit auto-fix --- source/textUtils/wordSeg/wordSegStrategy.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index 7eb882d6f2c..d1549fe904a 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -165,10 +165,14 @@ def _initCppJieba(cls, forceInit: bool = False): # TODO: make cppjieba alternat import config from ..segFlag import WordSegFlag - if not forceInit and (cls._lib or ( - not config.conf["documentNavigation"]["wordSegmentationStandard"].calculate() == WordSegFlag.CHINESE \ - and not cls.isUsingRelatedLanguage() - )): + if not forceInit and ( + cls._lib + or ( + not config.conf["documentNavigation"]["wordSegmentationStandard"].calculate() + == WordSegFlag.CHINESE + and not cls.isUsingRelatedLanguage() + ) + ): return try: from NVDAState import ReadPaths From 552b42bd4c51ada9b09d8af26258295625516f49 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Tue, 30 Sep 2025 10:33:24 +0800 Subject: [PATCH 77/93] fixup unittests --- source/config/featureFlagEnums.py | 18 ------------------ tests/unit/test_textUtils.py | 25 ++++++------------------- 2 files changed, 6 insertions(+), 37 deletions(-) diff --git a/source/config/featureFlagEnums.py 
b/source/config/featureFlagEnums.py index 357a16d2db7..d8d1a88b474 100644 --- a/source/config/featureFlagEnums.py +++ b/source/config/featureFlagEnums.py @@ -138,24 +138,6 @@ def _displayStringLabels(self) -> dict["FontFormattingBrailleModeFlag", str]: FontFormattingBrailleModeFlag.TAGS: _("Tags"), } - -class InitWordSegForUnusedLnagFlag(DisplayStringEnum): - """Boolean flag for whether to initialize the word segmenters for all languages, even if they are not used.""" - - @property - def _displayStringLabels(self): - return { - # Translators: Label for an option in NVDA settings. - self.DISABLED: _("Disabled"), - # Translators: Label for an option in NVDA settings. - self.ENABLED: _("Enabled"), - } - - DEFAULT = enum.auto() - DISABLED = enum.auto() - ENABLED = enum.auto() - - class WordNavigationUnitFlag(DisplayStringEnum): """Enumeration for word navigation.""" diff --git a/tests/unit/test_textUtils.py b/tests/unit/test_textUtils.py index 0e6501733ab..14e60441875 100644 --- a/tests/unit/test_textUtils.py +++ b/tests/unit/test_textUtils.py @@ -7,8 +7,10 @@ import unittest +import config.featureFlagEnums from textUtils import UnicodeNormalizationOffsetConverter, WideStringOffsetConverter, WordSegmenter from textUtils.uniscribe import splitAtCharacterBoundaries +from textUtils.segFlag import WordSegFlag FACE_PALM = "\U0001f926" # 🤦 SMILE = "\U0001f60a" # 😊 @@ -449,38 +451,23 @@ class TestWordSegmenter(unittest.TestCase): def test_basicLatin(self): text = "hello world" - segmenter = WordSegmenter(text) + segmenter = WordSegmenter(text, wordSegFlag=WordSegFlag.UNISCRIBE) self.assertEqual(segmenter.getSegmentForOffset(0), (0, 6)) - self.assertEqual(segmenter.getSegmentForOffset(1), (0, 6)) - self.assertEqual(segmenter.getSegmentForOffset(2), (0, 6)) - self.assertEqual(segmenter.getSegmentForOffset(3), (0, 6)) - self.assertEqual(segmenter.getSegmentForOffset(4), (0, 6)) self.assertEqual(segmenter.getSegmentForOffset(5), (0, 6)) 
self.assertEqual(segmenter.getSegmentForOffset(6), (6, 11)) - self.assertEqual(segmenter.getSegmentForOffset(7), (6, 11)) - self.assertEqual(segmenter.getSegmentForOffset(8), (6, 11)) - self.assertEqual(segmenter.getSegmentForOffset(9), (6, 11)) - self.assertEqual(segmenter.getSegmentForOffset(10), (6, 11)) self.assertEqual(segmenter.getSegmentForOffset(11), (6, 11)) def test_chinese(self): text = "你好世界" - # ensure that the Chinese segmentation strategy is used - import config from textUtils.wordSeg.wordSegStrategy import ChineseWordSegmentationStrategy - temp = config.conf["documentNavigation"]["initWordSegForUnusedLang"] - # ensure the word segmenters for unused languages are initialized - config.conf["documentNavigation"]["initWordSegForUnusedLang"] = True - ChineseWordSegmentationStrategy._initCppJieba() - segmenter = WordSegmenter(text) + + ChineseWordSegmentationStrategy._initCppJieba(forceInit=True) + segmenter = WordSegmenter(text, wordSegFlag=WordSegFlag.CHINESE) self.assertEqual(segmenter.getSegmentForOffset(0), (0, 2)) self.assertEqual(segmenter.getSegmentForOffset(1), (0, 2)) self.assertEqual(segmenter.getSegmentForOffset(2), (2, 4)) self.assertEqual(segmenter.getSegmentForOffset(3), (2, 4)) self.assertEqual(segmenter.getSegmentForOffset(4), (2, 4)) - - # revert the config change - config.conf["documentNavigation"]["initWordSegForUnusedLang"] = temp From 5e0e3fdbfefccb44cea10316620dfa05f1b01bef Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Tue, 30 Sep 2025 11:24:03 +0800 Subject: [PATCH 78/93] simplify the logic for 'Auto' option in Word Segmentation Standard settings --- source/textUtils/__init__.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 12ef420d7b5..8b2e0bef32e 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -564,15 +564,11 @@ def __init__(self, text: str, encoding: str = "UTF-8", wordSegFlag: 
WordSegFlag def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: optimize """Choose the appropriate segmentation strategy based on the text content.""" if self.wordSegFlag == WordSegFlag.AUTO: - if WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search( - self.text, - ) and not WordSegmenter._KANA.search(self.text): - if wordSegStrategy.ChineseWordSegmentationStrategy._lib: + if wordSegStrategy.ChineseWordSegmentationStrategy._lib \ + and WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search( + self.text, + ) and not WordSegmenter._KANA.search(self.text): return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) - else: - log.debugWarning("Chinese word segmenter is loading. Falling back to Uniscribe.") - wordSegStrategy.ChineseWordSegmentationStrategy._initCppJieba(forceInit=True) - return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) else: return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) else: From c3a856234e893589cb5fba5f322b8b7052ba3135 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 03:25:34 +0000 Subject: [PATCH 79/93] Pre-commit auto-fix --- source/config/featureFlagEnums.py | 1 + source/textUtils/__init__.py | 9 ++++++--- tests/unit/test_textUtils.py | 3 --- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/source/config/featureFlagEnums.py b/source/config/featureFlagEnums.py index d8d1a88b474..59c78bef409 100644 --- a/source/config/featureFlagEnums.py +++ b/source/config/featureFlagEnums.py @@ -138,6 +138,7 @@ def _displayStringLabels(self) -> dict["FontFormattingBrailleModeFlag", str]: FontFormattingBrailleModeFlag.TAGS: _("Tags"), } + class WordNavigationUnitFlag(DisplayStringEnum): """Enumeration for word navigation.""" diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index 8b2e0bef32e..83e8f739732 100644 --- 
a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -564,11 +564,14 @@ def __init__(self, text: str, encoding: str = "UTF-8", wordSegFlag: WordSegFlag def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: optimize """Choose the appropriate segmentation strategy based on the text content.""" if self.wordSegFlag == WordSegFlag.AUTO: - if wordSegStrategy.ChineseWordSegmentationStrategy._lib \ + if ( + wordSegStrategy.ChineseWordSegmentationStrategy._lib and WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search( self.text, - ) and not WordSegmenter._KANA.search(self.text): - return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) + ) + and not WordSegmenter._KANA.search(self.text) + ): + return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) else: return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) else: diff --git a/tests/unit/test_textUtils.py b/tests/unit/test_textUtils.py index 14e60441875..048c8580e78 100644 --- a/tests/unit/test_textUtils.py +++ b/tests/unit/test_textUtils.py @@ -7,7 +7,6 @@ import unittest -import config.featureFlagEnums from textUtils import UnicodeNormalizationOffsetConverter, WideStringOffsetConverter, WordSegmenter from textUtils.uniscribe import splitAtCharacterBoundaries from textUtils.segFlag import WordSegFlag @@ -462,8 +461,6 @@ def test_chinese(self): from textUtils.wordSeg.wordSegStrategy import ChineseWordSegmentationStrategy - - ChineseWordSegmentationStrategy._initCppJieba(forceInit=True) segmenter = WordSegmenter(text, wordSegFlag=WordSegFlag.CHINESE) self.assertEqual(segmenter.getSegmentForOffset(0), (0, 2)) From 0940a73f95c5ab71282f9b9dd76d6d58e3a3638b Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Tue, 30 Sep 2025 12:59:46 +0800 Subject: [PATCH 80/93] fixup --- source/gui/settingsDialogs.py | 3 --- source/textUtils/wordSeg/wordSegStrategy.py | 4 ++-- 2 files changed, 2 
insertions(+), 5 deletions(-) diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py index e96fff8fa1d..d4e83f06e78 100644 --- a/source/gui/settingsDialogs.py +++ b/source/gui/settingsDialogs.py @@ -3113,9 +3113,6 @@ def makeSettings(self, settingsSizer: wx.BoxSizer) -> None: self.bindHelpEvent("ParagraphStyle", self.paragraphStyleCombo) def onSave(self): - config.conf["documentNavigation"]["initWordSegForUnusedLang"] = ( - self.initUnusedLangCheckBox.IsChecked() - ) self.wordSegCombo.saveCurrentValueToConf() self.paragraphStyleCombo.saveCurrentValueToConf() diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index d1549fe904a..a2db01fd4ba 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -168,8 +168,8 @@ def _initCppJieba(cls, forceInit: bool = False): # TODO: make cppjieba alternat if not forceInit and ( cls._lib or ( - not config.conf["documentNavigation"]["wordSegmentationStandard"].calculate() - == WordSegFlag.CHINESE + config.conf["documentNavigation"]["wordSegmentationStandard"].calculated() + != config.featureFlagEnums.WordNavigationUnitFlag.CHINESE and not cls.isUsingRelatedLanguage() ) ): From d1373b208c297da942428387b1ff04e846d191c0 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Tue, 30 Sep 2025 13:06:54 +0800 Subject: [PATCH 81/93] fixup --- source/textUtils/wordSeg/wordSegUtils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/source/textUtils/wordSeg/wordSegUtils.py b/source/textUtils/wordSeg/wordSegUtils.py index e011d99b555..dff909e2945 100644 --- a/source/textUtils/wordSeg/wordSegUtils.py +++ b/source/textUtils/wordSeg/wordSegUtils.py @@ -33,7 +33,6 @@ def computedStrToEncodedOffsets(self) -> list[int]: """ strLen = self.strLength encodedLen = self.encodedStringLength - sepCount = len(self.newSepIndex) # validate separator positions (optional but makes bugs obvious) for pos 
in self.newSepIndex: @@ -61,12 +60,9 @@ def computedStrToEncodedOffsets(self) -> list[int]: return strToEncoded - @property def computedEncodedToStrOffsets(self) -> list[int]: encodedLen = self.encodedStringLength - strLen = self.strLength - sepCount = len(self.newSepIndex) # validate separator positions for pos in self.newSepIndex: @@ -95,7 +91,6 @@ def computedEncodedToStrOffsets(self) -> list[int]: return encodedToStr - @cached_property def encodedStringLength(self) -> int: """Returns the length of the string in its subclass-specific encoded representation.""" @@ -138,6 +133,8 @@ def encodedToStrOffsets( else: resultEnd = self.computedEncodedToStrOffsets[encodedEnd] return (resultStart, resultEnd) + + # Punctuation that should NOT have a separator BEFORE it (no space before these marks) NO_SEP_BEFORE = { # Common Chinese fullwidth punctuation From f98b1b198cd6e52b32077c60ac9d92a626139a81 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 05:10:50 +0000 Subject: [PATCH 82/93] Pre-commit auto-fix --- source/textUtils/wordSeg/wordSegStrategy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index a2db01fd4ba..e33bdad9210 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -163,7 +163,6 @@ def _initCppJieba(cls, forceInit: bool = False): # TODO: make cppjieba alternat set up ctypes signatures. 
""" import config - from ..segFlag import WordSegFlag if not forceInit and ( cls._lib From 80b047290f9ab8e8332728d1448f3f5ed7c051e6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 05:12:16 +0000 Subject: [PATCH 83/93] Pre-commit auto-fix --- source/textUtils/wordSeg/wordSegStrategy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index a2db01fd4ba..e33bdad9210 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -163,7 +163,6 @@ def _initCppJieba(cls, forceInit: bool = False): # TODO: make cppjieba alternat set up ctypes signatures. """ import config - from ..segFlag import WordSegFlag if not forceInit and ( cls._lib From d55d077dae1ebe176d1a5354d8c6ac328b1f5ea1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Oct 2025 21:19:50 +0000 Subject: [PATCH 84/93] Pre-commit auto-fix --- user_docs/en/changes.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index 6c02190dc5f..a3d226cc648 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -79,8 +79,6 @@ On ARM64 machines with Windows 11, these ARM64EC libraries are loaded instead of * In `braille.py`, the `FormattingMarker` class has a new `shouldBeUsed` method, to determine if the formatting marker key should be reported (#7608, @nvdaes) -#### API Breaking Changes - These are breaking API changes. Please open a GitHub issue if your add-on has an issue with updating to the new API. 
From 20830956b9abead664f369b40c1b10b9c3a324b3 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sat, 25 Oct 2025 08:50:00 +0800 Subject: [PATCH 85/93] fixup --- source/textUtils/wordSeg/wordSegStrategy.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index e33bdad9210..f59f57aca0c 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -57,8 +57,9 @@ class WordSegmentationStrategy(ABC): """Abstract base class for word segmentation strategies.""" def __init__(self, text: str, encoding: str | None = None): - self.text = text - self.encoding = encoding + self.text: str = text + self.encoding: str | None = encoding + self.wordEnds: list[int] = [] @abstractmethod def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: # TODO: optimize @@ -85,6 +86,10 @@ def getWordOffsetRange( @classmethod def isUsingRelatedLanguage(cls) -> bool: """Returns True if this strategy is for the current language.""" + + if not hasattr(cls, "_LANGUAGE_PATTERN"): + return False + import languageHandler import braille From d32549f308b02d8c5d06e0176c97b0317c160045 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Sat, 25 Oct 2025 09:03:22 +0800 Subject: [PATCH 86/93] make word segmentation module reinitialized after settings are saved --- source/gui/settingsDialogs.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py index fc06c8e3dfc..bca19ec8caf 100644 --- a/source/gui/settingsDialogs.py +++ b/source/gui/settingsDialogs.py @@ -3116,6 +3116,19 @@ def onSave(self): self.wordSegCombo.saveCurrentValueToConf() self.paragraphStyleCombo.saveCurrentValueToConf() + def postSave(self): + from textUtils import wordSeg + + log.debug("Reinitializing word segmentation module") + + try: + wordSeg.initialize() + 
except RuntimeError: + log.warning("Word segmentation module disabled in configuration") + except Exception: + log.error("Error reinitializing word segmentation module", exc_info=True) + import _localCaptioner + def _synthWarningDialog(newSynth: str): gui.messageBox( From b8ace769cbcce6dd3726988d47bc972fe66b8ca8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 25 Oct 2025 01:09:03 +0000 Subject: [PATCH 87/93] Pre-commit auto-fix --- source/gui/settingsDialogs.py | 1 - user_docs/en/changes.md | 2 -- 2 files changed, 3 deletions(-) diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py index bca19ec8caf..9621ab572fe 100644 --- a/source/gui/settingsDialogs.py +++ b/source/gui/settingsDialogs.py @@ -3127,7 +3127,6 @@ def postSave(self): log.warning("Word segmentation module disabled in configuration") except Exception: log.error("Error reinitializing word segmentation module", exc_info=True) - import _localCaptioner def _synthWarningDialog(newSynth: str): diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index a3d226cc648..e008a6b62b9 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -78,8 +78,6 @@ On ARM64 machines with Windows 11, these ARM64EC libraries are loaded instead of * Added [cppjieba](https://github.com/yanyiwu/cppjieba) as a git submodule for Chinese word segmentation. (#18548, @CrazySteve0605) * In `braille.py`, the `FormattingMarker` class has a new `shouldBeUsed` method, to determine if the formatting marker key should be reported (#7608, @nvdaes) - -These are breaking API changes. Please open a GitHub issue if your add-on has an issue with updating to the new API. * NVDA is now built with Python 3.13. 
(#18591) From d2714a369e3e9a8282e350fc732cc813c3301938 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Mon, 27 Oct 2025 11:28:27 +0800 Subject: [PATCH 88/93] fixup --- source/braille.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/braille.py b/source/braille.py index 9c656b89600..0d5f05d0e3d 100644 --- a/source/braille.py +++ b/source/braille.py @@ -576,7 +576,7 @@ def update(self): textToTranslate = self.rawText textToTranslateTypeforms = self.rawTextTypeforms cursorPos = self.cursorPos - if config.conf["braille"]["translationTable"].startswith("zh"): + if config.conf["braille"]["translationTable"].startswith("zh") or config.conf["braille"]["translationTable"] == "auto" and brailleTables.getDefaultTableForCurLang(brailleTables.TableType.OUTPUT).startswith("zh"): from textUtils.wordSeg.wordSegUtils import WordSegWithSeparatorOffsetConverter # noqa: F401 converter = WordSegWithSeparatorOffsetConverter(textToTranslate) From db90fff0dd7447f3876a615e6135aaeec183e040 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Tue, 28 Oct 2025 12:57:06 +0800 Subject: [PATCH 89/93] remove duplicate importing lines --- source/textInfos/offsets.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index f663dd2daf6..32e18fc41bf 100755 --- a/source/textInfos/offsets.py +++ b/source/textInfos/offsets.py @@ -169,8 +169,6 @@ def wordSegFlag(self) -> WordSegFlag | None: case config.featureFlagEnums.WordNavigationUnitFlag.CHINESE: return WordSegFlag.CHINESE case _: - from logHandler import log - log.error(f"Unknown word segmentation standard, {self.__wordSegConf.calculated()!r}") #: The encoding internal to the underlying text info implementation. 
From 9cafffb0c7a80ee9c3dac6d0f3f5edd31483b36c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Oct 2025 22:26:13 +0000 Subject: [PATCH 90/93] Pre-commit auto-fix --- source/braille.py | 6 +++++- user_docs/en/changes.md | 6 ++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/source/braille.py b/source/braille.py index 9df00baf2e3..79431273595 100644 --- a/source/braille.py +++ b/source/braille.py @@ -596,7 +596,11 @@ def update(self): textToTranslate = self.rawText textToTranslateTypeforms = self.rawTextTypeforms cursorPos = self.cursorPos - if config.conf["braille"]["translationTable"].startswith("zh") or config.conf["braille"]["translationTable"] == "auto" and brailleTables.getDefaultTableForCurLang(brailleTables.TableType.OUTPUT).startswith("zh"): + if ( + config.conf["braille"]["translationTable"].startswith("zh") + or config.conf["braille"]["translationTable"] == "auto" + and brailleTables.getDefaultTableForCurLang(brailleTables.TableType.OUTPUT).startswith("zh") + ): from textUtils.wordSeg.wordSegUtils import WordSegWithSeparatorOffsetConverter # noqa: F401 converter = WordSegWithSeparatorOffsetConverter(textToTranslate) diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index 09f128bd552..e337a9e9e5a 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -46,7 +46,11 @@ Windows 10 on ARM is also no longer supported. Windows 10 (Version 1507) is the minimum Windows version supported. We recommend using Windows 11, or if that is not possible, the latest Windows 10 release (Version 22H2). (#18684, @josephsl) * NVDA no longer supports 32bit Windows or Windows 10 on ARM. + + + >>>>>>> try-chineseWordSegmentation-staging + * Added a button to the About dialog to copy the NVDA version number to the clipboard. 
(#18667) * When entering a secure desktop, an installed copy of NVDA will automatically disable Braille temporarily, so that the secure desktop copy can access the braille display. (#2315, @LeonarddeR) * The length of beeps used when "Line indentation reporting" is set to "Tones" or "Both Speech and Tones" has been reduced. (#18898) @@ -118,8 +122,10 @@ Please open a GitHub issue if your add-on has an issue with updating to the new ======= All public symbols defined on this class are now accessible from `winBindings.magnification`. (#18958) * `gui.nvdaControls.TabbableScrolledPanel` has been removed. + Use `wx.lib.scrolledpanel.ScrolledPanel` directly instead. (#17751) * The following Windows 8.x Start screen support symbols have been removed from `appModules.explorer` (File Explorer) app module with no replacement: `SuggestionListItem`, `SearchBoxClient`, `GridTileElement`, `GridListTileElement`, `GridGroup`, `ImmersiveLauncher`. (#18757, @josephsl) + >>>>>>> try-chineseWordSegmentation-staging #### Deprecations From 096e985b97573339396c2883812e0c1b757d4f64 Mon Sep 17 00:00:00 2001 From: Wang Chong <306289287@qq.com> Date: Mon, 2 Mar 2026 12:02:27 +0800 Subject: [PATCH 91/93] Fixup for Chinese Word Segmentation and Braille Output (#19324) Summary of the issue: Some punctuations have extra separators (spaces) before or after them. Description of user facing changes: Braille output will be more accurate. 
--- nvdaHelper/cppjieba/sconscript | 2 +- source/textUtils/wordSeg/wordSegStrategy.py | 19 ++-- source/textUtils/wordSeg/wordSegUtils.py | 118 +++----------------- user_docs/en/changes.md | 12 +- 4 files changed, 25 insertions(+), 126 deletions(-) diff --git a/nvdaHelper/cppjieba/sconscript b/nvdaHelper/cppjieba/sconscript index d59fd0e3431..714c99330a9 100644 --- a/nvdaHelper/cppjieba/sconscript +++ b/nvdaHelper/cppjieba/sconscript @@ -42,7 +42,7 @@ env.AppendUnique( cppjiebaLib = env.SharedLibrary(target="cppjieba", source=sourceFiles) -if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # insure dicts installation happens only once and avoid a scons' warning +if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # ensure dicts installation happens only once and avoid a scons' warning env.Install( outDir.Dir("dicts"), [ diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index f59f57aca0c..ef919e99cb7 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -18,6 +18,7 @@ from collections.abc import Callable from typing import Any import re +import unicodedata import textUtils from logHandler import log @@ -289,8 +290,6 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> if len(self.wordEnds) <= 1: return self.text - from .wordSegUtils import NO_SEP_BEFORE, NO_SEP_AFTER - result = "" for sepIndex in range(len(self.wordEnds) - 1): preIndex = 0 if sepIndex == 0 else self.wordEnds[sepIndex - 1] @@ -305,15 +304,15 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> # separator already present at either side -> skip adding continue - # slice to check the next token (text between curIndex and postIndex) - nextSlice = self.text[curIndex:postIndex] - - # Determine whether any punctuation 
forbids a separator BEFORE the next token - noSepBefore = any(nextSlice.startswith(s) for s in NO_SEP_BEFORE) - # Determine whether any punctuation forbids a separator AFTER the current result - noSepAfter = any(result.endswith(s) for s in NO_SEP_AFTER) + # Unicode categories for punctuation + PUNCTUATION_CATEGORIES: str = "pP" + # Determine whether any punctuation forbids a separator + noSep = ( + unicodedata.category(self.text[curIndex - 1])[0] in PUNCTUATION_CATEGORIES + or unicodedata.category(self.text[curIndex])[0] in PUNCTUATION_CATEGORIES + ) - if not (noSepBefore or noSepAfter): + if not noSep: # If neither side forbids the separator, add it result += sep if newSepIndex is not None: diff --git a/source/textUtils/wordSeg/wordSegUtils.py b/source/textUtils/wordSeg/wordSegUtils.py index dff909e2945..d26a26cd9ba 100644 --- a/source/textUtils/wordSeg/wordSegUtils.py +++ b/source/textUtils/wordSeg/wordSegUtils.py @@ -19,7 +19,14 @@ def __init__(self, text: str): self.newSepIndex: list[int] = [] self.encoded = WordSegmenter(text).segmentedText(sep=self.sep, newSepIndex=self.newSepIndex) - @property + @cached_property + def _separatorFlag(self) -> list[bool]: + isSep = [False] * self.encodedStringLength + for pos in self.newSepIndex: + isSep[pos] = True + return isSep + + @cached_property def computedStrToEncodedOffsets(self) -> list[int]: """ Compute a list of offsets so that: @@ -32,23 +39,12 @@ def computedStrToEncodedOffsets(self) -> list[int]: original index. 
""" strLen = self.strLength - encodedLen = self.encodedStringLength - - # validate separator positions (optional but makes bugs obvious) - for pos in self.newSepIndex: - if pos < 0 or pos >= encodedLen: - raise ValueError(f"separator position {pos} out of range for encoded length {encodedLen}") - - # mark which encoded positions are separators - isSep = [False] * encodedLen - for pos in self.newSepIndex: - isSep[pos] = True # build explicit str -> encoded mapping strToEncoded: list[int] = [0] * strLen nextStrIndex = 0 - for encodedIndex in range(encodedLen): - if not isSep[encodedIndex]: + for encodedIndex in range(self.encodedStringLength): + if not self._separatorFlag[encodedIndex]: # assign the current original-char index to this encoded slot # then advance to the next original index if nextStrIndex >= strLen: @@ -60,27 +56,15 @@ def computedStrToEncodedOffsets(self) -> list[int]: return strToEncoded - @property + @cached_property def computedEncodedToStrOffsets(self) -> list[int]: - encodedLen = self.encodedStringLength - - # validate separator positions - for pos in self.newSepIndex: - if pos < 0 or pos >= encodedLen: - raise ValueError(f"separator position {pos} out of range for encoded length {encodedLen}") - - # mark which encoded positions are separators - isSep = [False] * encodedLen - for pos in self.newSepIndex: - isSep[pos] = True - # build explicit encoded -> str mapping # semantics: separator positions and the following encoded character # both map to the same upcoming original str index (insertion point semantics). 
- encodedToStr: list[int] = [0] * encodedLen + encodedToStr: list[int] = [0] * self.encodedStringLength nextStrIndex = 0 - for encodedIndex in range(encodedLen): - if isSep[encodedIndex]: + for encodedIndex in range(self.encodedStringLength): + if self._separatorFlag[encodedIndex]: # map separator to the next original character index (insertion point) encodedToStr[encodedIndex] = nextStrIndex else: @@ -133,77 +117,3 @@ def encodedToStrOffsets( else: resultEnd = self.computedEncodedToStrOffsets[encodedEnd] return (resultStart, resultEnd) - - -# Punctuation that should NOT have a separator BEFORE it (no space before these marks) -NO_SEP_BEFORE = { - # Common Chinese fullwidth punctuation - "。", - ",", - "、", - ";", - ":", - "?", - "!", - "…", - "...", - "—", - "–", - "——", - ")", - "】", - "》", - "〉", - "」", - "』", - "”", - "’", - "%", - "‰", - "¥", - # Common ASCII / halfwidth punctuation - ".", - ",", - ";", - ":", - "?", - "!", - "%", - ".", - ")", - "]", - "}", - ">", - '"', - "'", -} - -# Punctuation that should NOT have a separator AFTER it (no space after these marks) -NO_SEP_AFTER = { - # Common Chinese fullwidth opening/leading punctuation - "(", - "【", - "《", - "〈", - "「", - "『", - "“", - "‘", - # Common ASCII / halfwidth opening/leading punctuation - "(", - "[", - "{", - "<", - '"', - "'", - # Currency and prefix-like symbols that typically bind to the following token - "$", - "€", - "£", - "¥", - "₹", - # Social/identifier prefixes - "@", - "#", - "&", -} diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index e337a9e9e5a..6e3740b52bf 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -33,7 +33,7 @@ Windows 10 on ARM is also no longer supported. * In the Add-on Store, a new action has been added to see the latest changes for the current version of add-ons. (#14041, @josephsl, @nvdaes) * Chinese text can be navigated by word via build-in input gestures. 
Several GUI elements are added for its configuration in `Document Navigation` panel. (#18735, @CrazySteve0605) -* Braille output for Chinese contains spaces as word separaters. (#18865, @CrazySteve0605) +* Braille output for Chinese contains spaces as word separators. (#18865, @CrazySteve0605) * In browse mode, the number of items in a list is now reported in braille. (#7455, @nvdaes) ### Changes @@ -47,10 +47,6 @@ Windows 10 (Version 1507) is the minimum Windows version supported. We recommend using Windows 11, or if that is not possible, the latest Windows 10 release (Version 22H2). (#18684, @josephsl) * NVDA no longer supports 32bit Windows or Windows 10 on ARM. - - ->>>>>>> try-chineseWordSegmentation-staging - * Added a button to the About dialog to copy the NVDA version number to the clipboard. (#18667) * When entering a secure desktop, an installed copy of NVDA will automatically disable Braille temporarily, so that the secure desktop copy can access the braille display. (#2315, @LeonarddeR) * The length of beeps used when "Line indentation reporting" is set to "Tones" or "Both Speech and Tones" has been reduced. (#18898) @@ -117,17 +113,11 @@ Please open a GitHub issue if your add-on has an issue with updating to the new * the `rgpszUsageIdentifier` member of the `updateCheck.CERT_USAGE_MATCH` struct is now of type `POINTER(LPSTR)` rather than `c_void_p` to correctly align with Microsoft documentation. * The `UpdatableAddonsDialog.addonsList` is an instance of `gui.addonStoreGui.controls.addonList.AddonVirtualList`. (#18816, @nvdaes) * `visionEnhancementProviders.screenCurtain.Magnification` has been removed. -<<<<<<< HEAD - All public symbols defined on this class are now accessible from `winBindings.magnification`. (#18958) -======= All public symbols defined on this class are now accessible from `winBindings.magnification`. (#18958) * `gui.nvdaControls.TabbableScrolledPanel` has been removed. - Use `wx.lib.scrolledpanel.ScrolledPanel` directly instead. 
(#17751) * The following Windows 8.x Start screen support symbols have been removed from `appModules.explorer` (File Explorer) app module with no replacement: `SuggestionListItem`, `SearchBoxClient`, `GridTileElement`, `GridListTileElement`, `GridGroup`, `ImmersiveLauncher`. (#18757, @josephsl) ->>>>>>> try-chineseWordSegmentation-staging - #### Deprecations * `winVersion.WIN81` constant has been deprecated from the `winVersion` module. (#18684, @josephsl): From 072b40523c4e96316d06806dfb3b74192e38c63f Mon Sep 17 00:00:00 2001 From: cary-rowen Date: Thu, 9 Apr 2026 22:00:30 +0800 Subject: [PATCH 92/93] Fix word expansion without flowsTo --- source/textInfos/offsets.py | 14 +++++++------- tests/unit/test_textInfos.py | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index 95d7046a301..11d8c297314 100755 --- a/source/textInfos/offsets.py +++ b/source/textInfos/offsets.py @@ -583,13 +583,13 @@ def collapse(self, end=False): self._startOffset = self._endOffset def expand(self, unit): - if ( - unit == textInfos.UNIT_WORD - and self.isCollapsed - and not self.obj.flowsTo - and self._startOffset == self._getStoryLength() - ): - return + if unit == textInfos.UNIT_WORD and self.isCollapsed and self._startOffset == self._getStoryLength(): + try: + flowsTo = self.obj.flowsTo + except (AttributeError, NotImplementedError): + flowsTo = None + if not flowsTo: + return self._startOffset, self._endOffset = self._getUnitOffsets(unit, self._startOffset) def copy(self): diff --git a/tests/unit/test_textInfos.py b/tests/unit/test_textInfos.py index 5ada05ff6ab..af3ca1d0c20 100644 --- a/tests/unit/test_textInfos.py +++ b/tests/unit/test_textInfos.py @@ -176,6 +176,22 @@ def test_setEndpoint(self): self.assertEqual((ti1._startOffset, ti1._endOffset), (5, 5)) +class TestWordExpansion(unittest.TestCase): + def test_expandWordDoesNotRequireFlowsToBeforeEndOfStory(self): + obj = 
BasicTextProvider(text="one two") + ti = obj.makeTextInfo(Offsets(0, 0)) + ti.expand(textInfos.UNIT_WORD) + self.assertEqual(ti.text, "one") + + def test_expandWordAtEndOfStoryWithoutFlowsToDoesNothing(self): + obj = BasicTextProvider(text="one two") + ti = obj.makeTextInfo(textInfos.POSITION_ALL) + ti.collapse(end=True) + ti.expand(textInfos.UNIT_WORD) + self.assertEqual(ti.text, "") + self.assertEqual(ti.offsets, (7, 7)) + + class TestMoveToCodepointOffsetInBlackBoxTextInfo(unittest.TestCase): THREE_CHARS = "012" TEN_CHARS = "0123456789" From 4c9d616b0a9b3ca8c8edc1181edaf4254296ddaa Mon Sep 17 00:00:00 2001 From: cary-rowen Date: Thu, 9 Apr 2026 22:19:56 +0800 Subject: [PATCH 93/93] Fix textInfo word expansion test expectation --- tests/unit/test_textInfos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_textInfos.py b/tests/unit/test_textInfos.py index af3ca1d0c20..882e58782c3 100644 --- a/tests/unit/test_textInfos.py +++ b/tests/unit/test_textInfos.py @@ -181,7 +181,7 @@ def test_expandWordDoesNotRequireFlowsToBeforeEndOfStory(self): obj = BasicTextProvider(text="one two") ti = obj.makeTextInfo(Offsets(0, 0)) ti.expand(textInfos.UNIT_WORD) - self.assertEqual(ti.text, "one") + self.assertEqual(ti.text, "one ") def test_expandWordAtEndOfStoryWithoutFlowsToDoesNothing(self): obj = BasicTextProvider(text="one two")