nvaccess · seanbudd · Mar 2, 2026 · Dec 3, 2025 · Dec 3, 2025 · Dec 3, 2025
@@ -42,7 +42,7 @@ env.AppendUnique(
 
 cppjiebaLib = env.SharedLibrary(target="cppjieba", source=sourceFiles)
 
-if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # insure dicts installation happens only once and avoid a scons' warning
+if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # ensure dicts installation happens only once and avoid a SCons warning
 	env.Install(
 		outDir.Dir("dicts"),
 		[

@@ -18,6 +18,7 @@
 from collections.abc import Callable
 from typing import Any
 import re
+import unicodedata
 
 import textUtils
 from logHandler import log
@@ -289,8 +290,6 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) ->
 		if len(self.wordEnds) <= 1:
 			return self.text
 
-		from .wordSegUtils import NO_SEP_BEFORE, NO_SEP_AFTER
-
 		result = ""
 		for sepIndex in range(len(self.wordEnds) - 1):
 			preIndex = 0 if sepIndex == 0 else self.wordEnds[sepIndex - 1]
@@ -305,15 +304,15 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) ->
 				# separator already present at either side -> skip adding
 				continue
 
-			# slice to check the next token (text between curIndex and postIndex)
-			nextSlice = self.text[curIndex:postIndex]
-
-			# Determine whether any punctuation forbids a separator BEFORE the next token
-			noSepBefore = any(nextSlice.startswith(s) for s in NO_SEP_BEFORE)
-			# Determine whether any punctuation forbids a separator AFTER the current result
-			noSepAfter = any(result.endswith(s) for s in NO_SEP_AFTER)
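+			# First letter of the Unicode general categories for punctuation (Pc, Pd, Ps, Pe, Pi, Pf, Po). NOTE(review): category codes start with an uppercase letter, so the lowercase "p" below never matches.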
+			PUNCTUATION_CATEGORIES: str = "pP" 
+			# Determine whether any punctuation forbids a separator
+			noSep = (
+				unicodedata.category(self.text[curIndex - 1])[0] in PUNCTUATION_CATEGORIES
+				or unicodedata.category(self.text[curIndex])[0] in PUNCTUATION_CATEGORIES
+			)
 
-			if not (noSepBefore or noSepAfter):
+			if not noSep:
 				# If neither side forbids the separator, add it
 				result += sep
 				if newSepIndex is not None:

@@ -19,7 +19,14 @@ def __init__(self, text: str):
 		self.newSepIndex: list[int] = []
 		self.encoded = WordSegmenter(text).segmentedText(sep=self.sep, newSepIndex=self.newSepIndex)
 
-	@property
+	@cached_property
+	def _separatorFlag(self) -> list[bool]:
+		isSep = [False] * self.encodedStringLength
+		for pos in self.newSepIndex:
+			isSep[pos] = True
+		return isSep
+
+	@cached_property
 	def computedStrToEncodedOffsets(self) -> list[int]:
 		"""
 		Compute a list of offsets so that:
@@ -32,23 +39,12 @@ def computedStrToEncodedOffsets(self) -> list[int]:
 		original index.
 		"""
 		strLen = self.strLength
-		encodedLen = self.encodedStringLength
-
-		# validate separator positions (optional but makes bugs obvious)
-		for pos in self.newSepIndex:
-			if pos < 0 or pos >= encodedLen:
-				raise ValueError(f"separator position {pos} out of range for encoded length {encodedLen}")
-
-		# mark which encoded positions are separators
-		isSep = [False] * encodedLen
-		for pos in self.newSepIndex:
-			isSep[pos] = True
 
 		# build explicit str -> encoded mapping
 		strToEncoded: list[int] = [0] * strLen
 		nextStrIndex = 0
-		for encodedIndex in range(encodedLen):
-			if not isSep[encodedIndex]:
+		for encodedIndex in range(self.encodedStringLength):
+			if not self._separatorFlag[encodedIndex]:
 				# assign the current original-char index to this encoded slot
 				# then advance to the next original index
 				if nextStrIndex >= strLen:
@@ -60,27 +56,15 @@ def computedStrToEncodedOffsets(self) -> list[int]:
 
 		return strToEncoded
 
-	@property
+	@cached_property
 	def computedEncodedToStrOffsets(self) -> list[int]:
-		encodedLen = self.encodedStringLength
-
-		# validate separator positions
-		for pos in self.newSepIndex:
-			if pos < 0 or pos >= encodedLen:
-				raise ValueError(f"separator position {pos} out of range for encoded length {encodedLen}")
-
-		# mark which encoded positions are separators
-		isSep = [False] * encodedLen
-		for pos in self.newSepIndex:
-			isSep[pos] = True
-
 		# build explicit encoded -> str mapping
 		# semantics: separator positions and the following encoded character
 		# both map to the same upcoming original str index (insertion point semantics).
-		encodedToStr: list[int] = [0] * encodedLen
+		encodedToStr: list[int] = [0] * self.encodedStringLength
 		nextStrIndex = 0
-		for encodedIndex in range(encodedLen):
-			if isSep[encodedIndex]:
+		for encodedIndex in range(self.encodedStringLength):
+			if self._separatorFlag[encodedIndex]:
 				# map separator to the next original character index (insertion point)
 				encodedToStr[encodedIndex] = nextStrIndex
 			else:
@@ -133,77 +117,3 @@ def encodedToStrOffsets(
 		else:
 			resultEnd = self.computedEncodedToStrOffsets[encodedEnd]
 			return (resultStart, resultEnd)
-
-
-# Punctuation that should NOT have a separator BEFORE it (no space before these marks)
-NO_SEP_BEFORE = {
-	# Common Chinese fullwidth punctuation
-	"。",
-	"，",
-	"、",
-	"；",
-	"：",
-	"？",
-	"！",
-	"…",
-	"...",
-	"—",
-	"–",
-	"——",
-	"）",
-	"】",
-	"》",
-	"〉",
-	"」",
-	"』",
-	"”",
-	"’",
-	"％",
-	"‰",
-	"￥",
-	# Common ASCII / halfwidth punctuation
-	".",
-	",",
-	";",
-	":",
-	"?",
-	"!",
-	"%",
-	".",
-	")",
-	"]",
-	"}",
-	">",
-	'"',
-	"'",
-}
-
-# Punctuation that should NOT have a separator AFTER it (no space after these marks)
-NO_SEP_AFTER = {
-	# Common Chinese fullwidth opening/leading punctuation
-	"（",
-	"【",
-	"《",
-	"〈",
-	"「",
-	"『",
-	"“",
-	"‘",
-	# Common ASCII / halfwidth opening/leading punctuation
-	"(",
-	"[",
-	"{",
-	"<",
-	'"',
-	"'",
-	# Currency and prefix-like symbols that typically bind to the following token
-	"$",
-	"€",
-	"£",
-	"¥",
-	"₹",
-	# Social/identifier prefixes
-	"@",
-	"#",
-	"&",
-}
@@ -33,7 +33,7 @@ Windows 10 on ARM is also no longer supported.
 * In the Add-on Store, a new action has been added to see the latest changes for the current version of add-ons. (#14041, @josephsl, @nvdaes)
 * Chinese text can be navigated by word via built-in input gestures.
   Several GUI elements are added for its configuration in the `Document Navigation` panel. (#18735, @CrazySteve0605)
-* Braille output for Chinese contains spaces as word separaters. (#18865, @CrazySteve0605)
+* Braille output for Chinese contains spaces as word separators. (#18865, @CrazySteve0605)
 * In browse mode, the number of items in a list is now reported in braille. (#7455, @nvdaes)
 
 ### Changes
@@ -47,10 +47,6 @@ Windows 10 (Version 1507) is the minimum Windows version supported.
 We recommend using Windows 11, or if that is not possible, the latest Windows 10 release (Version 22H2). (#18684, @josephsl)
 * NVDA no longer supports 32bit Windows or Windows 10 on ARM.
 
-
-
->>>>>>> try-chineseWordSegmentation-staging
-
 * Added a button to the About dialog to copy the NVDA version number to the clipboard. (#18667)
 * When entering a secure desktop, an installed copy of NVDA will automatically disable Braille temporarily, so that the secure desktop copy can access the braille display. (#2315, @LeonarddeR)
 * The length of beeps used when "Line indentation reporting" is set to "Tones" or "Both Speech and Tones" has been reduced. (#18898)
@@ -117,17 +113,11 @@ Please open a GitHub issue if your add-on has an issue with updating to the new
 * The `rgpszUsageIdentifier` member of the `updateCheck.CERT_USAGE_MATCH` struct is now of type `POINTER(LPSTR)` rather than `c_void_p` to correctly align with Microsoft documentation.
 * The `UpdatableAddonsDialog.addonsList` is an instance of `gui.addonStoreGui.controls.addonList.AddonVirtualList`. (#18816, @nvdaes)
 * `visionEnhancementProviders.screenCurtain.Magnification` has been removed.
-<<<<<<< HEAD
-  All public symbols defined on this class are now accessible from `winBindings.magnification`. (#18958)
-=======
   All public symbols defined on this class are now accessible from `winBindings.magnification`. (#18958)
 * `gui.nvdaControls.TabbableScrolledPanel` has been removed.
-
   Use `wx.lib.scrolledpanel.ScrolledPanel` directly instead. (#17751)
 * The following Windows 8.x Start screen support symbols have been removed from `appModules.explorer` (File Explorer) app module with no replacement: `SuggestionListItem`, `SearchBoxClient`, `GridTileElement`, `GridListTileElement`, `GridGroup`, `ImmersiveLauncher`. (#18757, @josephsl)
 
->>>>>>> try-chineseWordSegmentation-staging
-
 #### Deprecations
 
 * `winVersion.WIN81` constant has been deprecated from the `winVersion` module. (#18684, @josephsl):