Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion nvdaHelper/cppjieba/sconscript
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ env.AppendUnique(

cppjiebaLib = env.SharedLibrary(target="cppjieba", source=sourceFiles)

if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # insure dicts installation happens only once and avoid a scons' warning
if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # ensure dicts installation happens only once and avoid a scons' warning
env.Install(
outDir.Dir("dicts"),
[
Expand Down
19 changes: 9 additions & 10 deletions source/textUtils/wordSeg/wordSegStrategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from collections.abc import Callable
from typing import Any
import re
import unicodedata

import textUtils
from logHandler import log
Expand Down Expand Up @@ -289,8 +290,6 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) ->
if len(self.wordEnds) <= 1:
return self.text

from .wordSegUtils import NO_SEP_BEFORE, NO_SEP_AFTER

result = ""
for sepIndex in range(len(self.wordEnds) - 1):
preIndex = 0 if sepIndex == 0 else self.wordEnds[sepIndex - 1]
Expand All @@ -305,15 +304,15 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) ->
# separator already present at either side -> skip adding
continue

# slice to check the next token (text between curIndex and postIndex)
nextSlice = self.text[curIndex:postIndex]

# Determine whether any punctuation forbids a separator BEFORE the next token
noSepBefore = any(nextSlice.startswith(s) for s in NO_SEP_BEFORE)
# Determine whether any punctuation forbids a separator AFTER the current result
noSepAfter = any(result.endswith(s) for s in NO_SEP_AFTER)
# Unicode categories for punctuation
PUNCTUATION_CATEGORIES: str = "pP"
# Determine whether any punctuation forbids a separator
noSep = (
unicodedata.category(self.text[curIndex - 1])[0] in PUNCTUATION_CATEGORIES
or unicodedata.category(self.text[curIndex])[0] in PUNCTUATION_CATEGORIES
)

if not (noSepBefore or noSepAfter):
if not noSep:
# If neither side forbids the separator, add it
result += sep
if newSepIndex is not None:
Expand Down
118 changes: 14 additions & 104 deletions source/textUtils/wordSeg/wordSegUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,14 @@ def __init__(self, text: str):
self.newSepIndex: list[int] = []
self.encoded = WordSegmenter(text).segmentedText(sep=self.sep, newSepIndex=self.newSepIndex)

@property
# Builds a per-position mask over the encoded string: True where a separator was inserted.
# NOTE(review): presumably `functools.cached_property`, so the list would be computed once
# per instance and reused by later accesses — the import is not visible in this hunk; confirm.
@cached_property
def _separatorFlag(self) -> list[bool]:
# Start with one False entry per encoded position (length taken from self.encodedStringLength).
isSep = [False] * self.encodedStringLength
# Flip to True every position recorded in self.newSepIndex.
# NOTE(review): positions are not range-checked here — a negative value indexes from the end
# of the list and a value >= encodedStringLength raises IndexError; confirm that
# newSepIndex can never contain such values.
for pos in self.newSepIndex:
isSep[pos] = True
return isSep

@cached_property
def computedStrToEncodedOffsets(self) -> list[int]:
"""
Compute a list of offsets so that:
Expand All @@ -32,23 +39,12 @@ def computedStrToEncodedOffsets(self) -> list[int]:
original index.
"""
strLen = self.strLength
encodedLen = self.encodedStringLength

# validate separator positions (optional but makes bugs obvious)
for pos in self.newSepIndex:
if pos < 0 or pos >= encodedLen:
raise ValueError(f"separator position {pos} out of range for encoded length {encodedLen}")

# mark which encoded positions are separators
isSep = [False] * encodedLen
for pos in self.newSepIndex:
isSep[pos] = True

# build explicit str -> encoded mapping
strToEncoded: list[int] = [0] * strLen
nextStrIndex = 0
for encodedIndex in range(encodedLen):
if not isSep[encodedIndex]:
for encodedIndex in range(self.encodedStringLength):
if not self._separatorFlag[encodedIndex]:
# assign the current original-char index to this encoded slot
# then advance to the next original index
if nextStrIndex >= strLen:
Expand All @@ -60,27 +56,15 @@ def computedStrToEncodedOffsets(self) -> list[int]:

return strToEncoded

@property
@cached_property
def computedEncodedToStrOffsets(self) -> list[int]:
encodedLen = self.encodedStringLength

# validate separator positions
for pos in self.newSepIndex:
if pos < 0 or pos >= encodedLen:
raise ValueError(f"separator position {pos} out of range for encoded length {encodedLen}")

# mark which encoded positions are separators
isSep = [False] * encodedLen
for pos in self.newSepIndex:
isSep[pos] = True

# build explicit encoded -> str mapping
# semantics: separator positions and the following encoded character
# both map to the same upcoming original str index (insertion point semantics).
encodedToStr: list[int] = [0] * encodedLen
encodedToStr: list[int] = [0] * self.encodedStringLength
nextStrIndex = 0
for encodedIndex in range(encodedLen):
if isSep[encodedIndex]:
for encodedIndex in range(self.encodedStringLength):
if self._separatorFlag[encodedIndex]:
# map separator to the next original character index (insertion point)
encodedToStr[encodedIndex] = nextStrIndex
else:
Expand Down Expand Up @@ -133,77 +117,3 @@ def encodedToStrOffsets(
else:
resultEnd = self.computedEncodedToStrOffsets[encodedEnd]
return (resultStart, resultEnd)


# Punctuation that should NOT have a separator BEFORE it (no space before these marks).
# Fullwidth forms are spelled as \uXXXX escapes so they cannot be confused with, or silently
# folded into, their ASCII look-alikes; every fullwidth mark has its halfwidth twin listed in
# the ASCII section below, so both widths are covered.
NO_SEP_BEFORE = {
    # Common Chinese fullwidth punctuation
    "。",
    "\uff0c",  # FULLWIDTH COMMA
    "、",
    "\uff1b",  # FULLWIDTH SEMICOLON
    "\uff1a",  # FULLWIDTH COLON
    "\uff1f",  # FULLWIDTH QUESTION MARK
    "\uff01",  # FULLWIDTH EXCLAMATION MARK
    "\uff0e",  # FULLWIDTH FULL STOP
    "…",
    "...",
    "—",
    "–",
    "——",
    "\uff09",  # FULLWIDTH RIGHT PARENTHESIS
    "】",
    "》",
    "〉",
    "」",
    "』",
    "”",
    "’",
    "\uff05",  # FULLWIDTH PERCENT SIGN
    "‰",
    "\u00a5",  # YEN SIGN
    "\uffe5",  # FULLWIDTH YEN SIGN
    # Common ASCII / halfwidth punctuation
    ".",
    ",",
    ";",
    ":",
    "?",
    "!",
    "%",
    ")",
    "]",
    "}",
    ">",
    '"',
    "'",
}

# Punctuation that should NOT have a separator AFTER it (no space after these marks).
# Fullwidth forms are spelled as \uXXXX escapes so they cannot be confused with, or silently
# folded into, their ASCII look-alikes.
NO_SEP_AFTER = {
    # Common Chinese fullwidth opening/leading punctuation
    "\uff08",  # FULLWIDTH LEFT PARENTHESIS
    "【",
    "《",
    "〈",
    "「",
    "『",
    "“",
    "‘",
    # Common ASCII / halfwidth opening/leading punctuation
    "(",
    "[",
    "{",
    "<",
    '"',
    "'",
    # Currency and prefix-like symbols that typically bind to the following token
    "$",
    "€",
    "£",
    "\u00a5",  # YEN SIGN
    "\uffe5",  # FULLWIDTH YEN SIGN
    "₹",
    # Social/identifier prefixes
    "@",
    "#",
    "&",
}
12 changes: 1 addition & 11 deletions user_docs/en/changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ Windows 10 on ARM is also no longer supported.
* In the Add-on Store, a new action has been added to see the latest changes for the current version of add-ons. (#14041, @josephsl, @nvdaes)
* Chinese text can be navigated by word via built-in input gestures.
Several GUI elements are added for its configuration in the `Document Navigation` panel. (#18735, @CrazySteve0605)
* Braille output for Chinese contains spaces as word separaters. (#18865, @CrazySteve0605)
* Braille output for Chinese contains spaces as word separators. (#18865, @CrazySteve0605)
* In browse mode, the number of items in a list is now reported in braille. (#7455, @nvdaes)

### Changes
Expand All @@ -47,10 +47,6 @@ Windows 10 (Version 1507) is the minimum Windows version supported.
We recommend using Windows 11, or if that is not possible, the latest Windows 10 release (Version 22H2). (#18684, @josephsl)
* NVDA no longer supports 32bit Windows or Windows 10 on ARM.



>>>>>>> try-chineseWordSegmentation-staging

* Added a button to the About dialog to copy the NVDA version number to the clipboard. (#18667)
* When entering a secure desktop, an installed copy of NVDA will automatically disable Braille temporarily, so that the secure desktop copy can access the braille display. (#2315, @LeonarddeR)
* The length of beeps used when "Line indentation reporting" is set to "Tones" or "Both Speech and Tones" has been reduced. (#18898)
Expand Down Expand Up @@ -117,17 +113,11 @@ Please open a GitHub issue if your add-on has an issue with updating to the new
* the `rgpszUsageIdentifier` member of the `updateCheck.CERT_USAGE_MATCH` struct is now of type `POINTER(LPSTR)` rather than `c_void_p` to correctly align with Microsoft documentation.
* The `UpdatableAddonsDialog.addonsList` is an instance of `gui.addonStoreGui.controls.addonList.AddonVirtualList`. (#18816, @nvdaes)
* `visionEnhancementProviders.screenCurtain.Magnification` has been removed.
<<<<<<< HEAD
All public symbols defined on this class are now accessible from `winBindings.magnification`. (#18958)
=======
All public symbols defined on this class are now accessible from `winBindings.magnification`. (#18958)
* `gui.nvdaControls.TabbableScrolledPanel` has been removed.

Use `wx.lib.scrolledpanel.ScrolledPanel` directly instead. (#17751)
* The following Windows 8.x Start screen support symbols have been removed from `appModules.explorer` (File Explorer) app module with no replacement: `SuggestionListItem`, `SearchBoxClient`, `GridTileElement`, `GridListTileElement`, `GridGroup`, `ImmersiveLauncher`. (#18757, @josephsl)

>>>>>>> try-chineseWordSegmentation-staging

#### Deprecations

* `winVersion.WIN81` constant has been deprecated from the `winVersion` module. (#18684, @josephsl):
Expand Down
Loading