Skip to content
36 changes: 30 additions & 6 deletions nvdaHelper/cppjieba/cppjieba.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ JiebaSingleton* JiebaSingleton::instance = nullptr;
std::once_flag JiebaSingleton::initFlag;

JiebaSingleton& JiebaSingleton::getInstance(const char* dictDir) {
// convert incoming C-string+length to std::string (handles dictDir == nullptr)
if (!dictDir) {
throw std::invalid_argument("JiebaSingleton::getInstance() requires a non-null dictionary path.");
}

// convert incoming C-string to std::string before entering call_once
std::string dir = dictDir;

// ensure singleton is constructed exactly once
Expand Down Expand Up @@ -64,6 +68,8 @@ void JiebaSingleton::getWordEndOffsets(const std::string& text, std::vector<int>
extern "C" {

bool initJieba(const char* dictDir) {
if (!dictDir) return false;

try {
// simply force the singleton into existence
(void)JiebaSingleton::getInstance(dictDir);
Expand Down Expand Up @@ -105,16 +111,34 @@ bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen) {
}
}

bool insertUserWord(const char* word, int freq, const char* tag) {
	// Reject null pointers up front: constructing std::string from nullptr is
	// undefined behaviour, and callers across the C ABI may pass nulls.
	if (!word || !tag) {
		return false;
	}
	try {
		// InsertUserWord returns false when the word could not be added.
		return JiebaSingleton::getInstance().InsertUserWord(string(word), freq, string(tag));
	} catch (...) {
		// getInstance() may throw (e.g. when the dictionary path was never
		// initialised); never let an exception escape the extern "C" boundary.
		return false;
	}
}

bool deleteUserWord(const char* word, const char* tag) {
	// Guard against nulls before constructing std::string (UB on nullptr).
	if (!word || !tag) {
		return false;
	}
	try {
		// DeleteUserWord returns false when the word was not present.
		return JiebaSingleton::getInstance().DeleteUserWord(string(word), string(tag));
	} catch (...) {
		// Exceptions must not cross the extern "C" ABI boundary.
		return false;
	}
}

bool find(const char* word) {
	// Null check must come first: the stray unconditional return left over
	// from the pre-review version made this guard (and the try/catch below)
	// unreachable dead code.
	if (!word) {
		return false;
	}
	try {
		// Find reports whether the word exists in the dictionary.
		return JiebaSingleton::getInstance().Find(string(word));
	} catch (...) {
		// Exceptions must not cross the extern "C" ABI boundary.
		return false;
	}
}

void freeOffsets(int* ptr) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ We currently have the following templates:
* Please note that these are reported differently, for more information refer to our [disclosure policy/procedure](https://github.com/nvaccess/nvda/blob/master/security.md)
* Issues with materials handled by translators should be reported to the [NVDA Translators list](https://groups.io/g/nvda-translations).
These include:
* NVDA interface text that is incorrect in languages other than English
* Contents of the User Guide and Changes documents that are incorrect in languages other than English
* NVDA interface text that is incorrect in languages other than English
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is an unexpected change. Could you please confirm it?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mistakenly applied the changes suggested by the AI, and need to restore the previous state.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems to come from commit b34a049, and I'm not familiar with how it works.

* Contents of the User Guide and Changes documents that are incorrect in languages other than English
* Input gestures, punctuation/symbol pronunciations, and character descriptions in languages other than English

These templates are fillable forms that guide you through the process of providing the necessary information for your issue.
Expand Down
4 changes: 2 additions & 2 deletions projectDocs/issues/readme.md
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is an unexpected change. Could you please confirm it?

Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ Do not report security concerns via GitHub issues, instead follow our [security
Issues with materials handled by translators should be reported to the [NVDA Translators list](https://groups.io/g/nvda-translations).
These include:

* NVDA interface text that is incorrect in languages other than English
* Contents of the User Guide and Changes documents that are incorrect in languages other than English
* NVDA interface text that is incorrect in languages other than English
* Contents of the User Guide and Changes documents that are incorrect in languages other than English
* Input gestures, punctuation/symbol pronunciations, and character descriptions in languages other than English

If you are reporting an issue with an application or website, please consider reporting the issue to the [authors of the application/website](./thirdPartyReporting.md) first.
Expand Down
1 change: 1 addition & 0 deletions source/config/configSpec.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@
reportClickable = boolean(default=true)
[documentNavigation]
# Hidden option to eagerly initialize Chinese word segmentation even when the current languages do not use it.
initWordSegForUnusedLang = boolean(default=false)
wordSegmentationStandard = featureFlag(optionsEnum="WordNavigationUnitFlag", behaviorOfDefault="Auto")
paragraphStyle = featureFlag(optionsEnum="ParagraphNavigationFlag", behaviorOfDefault="application")
Expand Down
3 changes: 2 additions & 1 deletion source/textInfos/offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ def wordSegFlag(self) -> WordSegFlag | None:
case config.featureFlagEnums.WordNavigationUnitFlag.CHINESE:
return WordSegFlag.CHINESE
case _:
log.error(f"Unknown word segmentation standard, {self.__wordSegConf.calculated()!r}")
log.error(f"Unknown word segmentation standard, {self.wordSegConf.calculated()!r}")
return None

#: The encoding internal to the underlying text info implementation.
encoding: Optional[str] = textUtils.WCHAR_ENCODING
Expand Down
5 changes: 4 additions & 1 deletion source/textUtils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,9 @@ def __init__(self, text: str, encoding: str = "UTF-8", wordSegFlag: WordSegFlag
self.wordSegFlag: WordSegFlag = wordSegFlag
self.strategy: wordSegStrategy.WordSegmentationStrategy = self._chooseStrategy()

def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: optimize
def _chooseStrategy(
self,
) -> wordSegStrategy.WordSegmentationStrategy: # TODO: Limit regex scans for large text.
"""Choose the appropriate segmentation strategy based on the text content."""
if self.wordSegFlag == WordSegFlag.AUTO:
if (
Expand All @@ -627,6 +629,7 @@ def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO:
return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)
case _:
return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)
return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)

def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
"""Get the segment containing the given offset."""
Expand Down
1 change: 0 additions & 1 deletion source/textUtils/wordSeg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,3 @@ def initialize():
Thread(target=callable_to_call, args=args, kwargs=kwargs, daemon=True).start()
except Exception as e:
log.debug("Initializer %s.%s failed: %s", module_name, qualname, e)
return
65 changes: 34 additions & 31 deletions source/textUtils/wordSeg/wordSegStrategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

import os
import ctypes
from ctypes import (
c_bool,
c_char_p,
c_int,
create_string_buffer,
cdll,
POINTER,
byref,
)
Expand Down Expand Up @@ -122,8 +122,8 @@ def _calculateUniscribeOffsets(

helperFunc = NVDAHelper.localLib.calculateWordOffsets

relStart = ctypes.c_int()
relEnd = ctypes.c_int()
relStart = c_int()
relEnd = c_int()
# uniscribe does some strange things
# when you give it a string with not more than two alphanumeric chars in a row.
# Inject two alphanumeric characters at the end to fix this
Expand All @@ -139,8 +139,8 @@ def _calculateUniscribeOffsets(
uniscribeLineText,
uniscribeLineLength,
relOffset,
ctypes.byref(relStart),
ctypes.byref(relEnd),
byref(relStart),
byref(relEnd),
):
relStart = relStart.value
relEnd = min(lineLength, relEnd.value)
Expand All @@ -163,27 +163,31 @@ class ChineseWordSegmentationStrategy(WordSegmentationStrategy):

@classmethod
@initializerRegistry
def _initCppJieba(cls, forceInit: bool = False): # TODO: make cppjieba alternative
def _initCppJieba(cls, forceInit: bool = False): # TODO: Add a fallback when cppjieba.dll is unavailable.
"""
Class-level initializer: attempts to load the versioned cppjieba library and
set up ctypes signatures.
"""
import config

if not forceInit and (
cls._lib
or (
config.conf["documentNavigation"]["wordSegmentationStandard"].calculated()
!= config.featureFlagEnums.WordNavigationUnitFlag.CHINESE
and not cls.isUsingRelatedLanguage()
)
):
if cls._lib:
return

if not forceInit:
documentNavigationConf = config.conf["documentNavigation"]
shouldInit = (
documentNavigationConf["wordSegmentationStandard"].calculated()
== config.featureFlagEnums.WordNavigationUnitFlag.CHINESE
or cls.isUsingRelatedLanguage()
or documentNavigationConf["initWordSegForUnusedLang"]
)
if not shouldInit:
return
try:
from NVDAState import ReadPaths

lib_path = os.path.join(ReadPaths.coreArchLibPath, "cppjieba.dll")
cls._lib = ctypes.cdll.LoadLibrary(lib_path)
cls._lib = cdll.LoadLibrary(lib_path)

# Setup function signatures
# bool initJieba(const char* dictDir)
Expand Down Expand Up @@ -222,17 +226,17 @@ def _initCppJieba(cls, forceInit: bool = False): # TODO: make cppjieba alternat
cls._lib = None

@lru_cache(maxsize=256)
def _callCppjiebaCached(self, text_utf8: bytes) -> list[int] | None:
def _callCppjiebaCached(self, text_utf8: bytes) -> list[int]:
if self._lib is None:
return None
return []

charPtr = POINTER(c_int)()
outLen = c_int(0)

try:
success: bool = self._lib.calculateWordOffsets(text_utf8, byref(charPtr), byref(outLen))
if not success or not bool(charPtr) or outLen.value <= 0:
return None
return []

try:
n = outLen.value
Expand All @@ -245,14 +249,14 @@ def _callCppjiebaCached(self, text_utf8: bytes) -> list[int] | None:
try:
if bool(charPtr):
self._lib.freeOffsets(charPtr)
except Exception:
pass
return None
except Exception as cleanupErr:
log.debugWarning("Failed to free cppjieba offsets after error: %s", cleanupErr)
return []

def _callCPPJieba(self) -> list[int] | None:
def _callCPPJieba(self) -> list[int]:
"""
Instance method: encode self.text and call cppjieba.
Returns list[int] on success, None on failure.
Returns list[int] on success, or an empty list on failure.
Uses LRU cache keyed by utf-8 bytes.
"""
data = self.text.encode("utf-8")
Expand All @@ -261,14 +265,14 @@ def _callCPPJieba(self) -> list[int] | None:
return self._callCppjiebaCached(data)
else:
if self._lib is None:
return None
return []

charPtr = POINTER(c_int)()
outLen = c_int(0)
try:
success: bool = self._lib.calculateWordOffsets(data, byref(charPtr), byref(outLen))
if not success or not bool(charPtr) or outLen.value <= 0:
return None
return []

try:
n = outLen.value
Expand All @@ -280,9 +284,9 @@ def _callCPPJieba(self) -> list[int] | None:
try:
if bool(charPtr):
self._lib.freeOffsets(charPtr)
except Exception:
pass
return None
except Exception as cleanupErr:
log.debugWarning("Failed to free cppjieba offsets after error: %s", cleanupErr)
return []

def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str:
"""Segments the text using the word end indices."""
Expand Down Expand Up @@ -317,9 +321,8 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) ->
result += sep
if newSepIndex is not None:
newSepIndex.append(len(result) - len(sep))
else:
# append the final trailing token after the loop
result += self.text[curIndex:postIndex]
# append the final trailing token after the loop
result += self.text[curIndex:postIndex]

return result

Expand Down
47 changes: 25 additions & 22 deletions source/textUtils/wordSeg/wordSegUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,11 @@ def _separatorFlag(self) -> list[bool]:
@cached_property
def computedStrToEncodedOffsets(self) -> list[int]:
"""
Compute a list of offsets so that:
encodedIndex = strIndex + relevantStrToEncodedOffsets[strIndex]

We build an explicit mapping from original string indices to encoded indices
by marking separator positions in the encoded string and then assigning
each non-separator encoded slot to the next original-character index.
The returned list contains the delta (encodedIndex - strIndex) for each
original index.
The returned list contains the absolute encoded index for each original
character index.
"""
strLen = self.strLength

Expand Down Expand Up @@ -80,40 +77,46 @@ def encodedStringLength(self) -> int:
"""Returns the length of the string in its subclass-specific encoded representation."""
return len(self.encoded)

def _strOffsetToEncodedOffset(self, offset: int) -> int:
if offset == self.strLength:
return self.encodedStringLength
return self.computedStrToEncodedOffsets[offset]

def _encodedOffsetToStrOffset(self, offset: int) -> int:
if offset == self.encodedStringLength:
return self.strLength
return self.computedEncodedToStrOffsets[offset]

def strToEncodedOffsets(
	self,
	strStart: int,
	strEnd: int | None = None,
	raiseOnError: bool = False,
) -> int | tuple[int, int]:
	"""Convert original-string offsets to encoded-string offsets.

	:param strStart: start offset in the original string; clamped to [0, strLength].
	:param strEnd: optional end offset; when given, a (start, end) tuple is returned.
	:param raiseOnError: forwarded to the base-class validation hook.
	:return: a single encoded offset, or a (start, end) tuple when strEnd is given.
	"""
	# NOTE(review): this span mixed removed pre-review lines (unclamped table
	# lookups) with the new implementation; only the clamped version is kept.
	super().strToEncodedOffsets(strStart, strEnd, raiseOnError)
	strStart = max(0, min(strStart, self.strLength))
	resultStart = self._strOffsetToEncodedOffset(strStart)
	if strEnd is None:
		return resultStart
	strEnd = max(0, min(strEnd, self.strLength))
	if strStart == strEnd:
		return (resultStart, resultStart)
	resultEnd = self._strOffsetToEncodedOffset(strEnd)
	return (resultStart, resultEnd)

def encodedToStrOffsets(
	self,
	encodedStart: int,
	encodedEnd: int | None = None,
	raiseOnError: bool = False,
) -> int | tuple[int, int]:
	"""Convert encoded-string offsets back to original-string offsets.

	:param encodedStart: start offset in the encoded string; clamped to
		[0, encodedStringLength].
	:param encodedEnd: optional end offset; when given, a (start, end) tuple is returned.
	:param raiseOnError: forwarded to the base-class validation hook.
	:return: a single string offset, or a (start, end) tuple when encodedEnd is given.
	"""
	# NOTE(review): this span mixed removed pre-review lines (unclamped table
	# lookups) with the new implementation; only the clamped version is kept.
	super().encodedToStrOffsets(encodedStart, encodedEnd, raiseOnError)
	encodedStart = max(0, min(encodedStart, self.encodedStringLength))
	resultStart = self._encodedOffsetToStrOffset(encodedStart)
	if encodedEnd is None:
		return resultStart
	encodedEnd = max(0, min(encodedEnd, self.encodedStringLength))
	if encodedStart == encodedEnd:
		return (resultStart, resultStart)
	resultEnd = self._encodedOffsetToStrOffset(encodedEnd)
	return (resultStart, resultEnd)
17 changes: 17 additions & 0 deletions tests/unit/test_textInfos.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"""Unit tests for the textInfos module, its submodules and classes."""

import unittest
from unittest.mock import patch
from .textProvider import BasicTextProvider, MockBlackBoxTextInfo
import textInfos
from textInfos.offsets import Offsets
Expand Down Expand Up @@ -192,6 +193,22 @@ def test_expandWordAtEndOfStoryWithoutFlowsToDoesNothing(self):
self.assertEqual(ti.offsets, (7, 7))


class TestWordSegFlag(unittest.TestCase):
	"""Tests for the wordSegFlag property on offset-based text infos."""

	class _UnknownWordSegConf:
		# Stand-in config object whose calculated() yields a value outside
		# the known WordNavigationUnitFlag options.
		def calculated(self):
			return "unexpected"

	def test_unknownWordSegConfigReturnsNoneAfterLogging(self):
		provider = BasicTextProvider(text="abc")
		textInfo = provider.makeTextInfo(Offsets(0, 0))
		textInfo.wordSegConf = self._UnknownWordSegConf()
		with patch("textInfos.offsets.log.error") as mockLogError:
			self.assertIsNone(textInfo.wordSegFlag)
		mockLogError.assert_called_once_with("Unknown word segmentation standard, 'unexpected'")


class TestMoveToCodepointOffsetInBlackBoxTextInfo(unittest.TestCase):
THREE_CHARS = "012"
TEN_CHARS = "0123456789"
Expand Down
Loading