Skip to content
36 changes: 30 additions & 6 deletions nvdaHelper/cppjieba/cppjieba.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ JiebaSingleton* JiebaSingleton::instance = nullptr;
std::once_flag JiebaSingleton::initFlag;

JiebaSingleton& JiebaSingleton::getInstance(const char* dictDir) {
// convert incoming C-string+length to std::string (handles dictDir == nullptr)
if (!dictDir) {
throw std::invalid_argument("JiebaSingleton::getInstance() requires a non-null dictionary path.");
}

// convert incoming C-string to std::string before entering call_once
std::string dir = dictDir;

// ensure singleton is constructed exactly once
Expand Down Expand Up @@ -64,6 +68,8 @@ void JiebaSingleton::getWordEndOffsets(const std::string& text, std::vector<int>
extern "C" {

bool initJieba(const char* dictDir) {
if (!dictDir) return false;

try {
// simply force the singleton into existence
(void)JiebaSingleton::getInstance(dictDir);
Expand Down Expand Up @@ -105,16 +111,34 @@ bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen) {
}
}

bool insertUserWord(const char* word, int freq, const char* tag) {
	// Reject null pointers up front: constructing std::string from nullptr is
	// undefined behaviour, and callers across the C ABI may pass nulls.
	if (!word || !tag) {
		return false;
	}
	try {
		// InsertUserWord returns false when the word could not be added.
		return JiebaSingleton::getInstance().InsertUserWord(string(word), freq, string(tag));
	} catch (...) {
		// getInstance() may throw (e.g. when the dictionary path was never
		// initialised); never let an exception escape the extern "C" boundary.
		return false;
	}
}

bool deleteUserWord(const char* word, const char* tag) {
	// Guard against nulls before constructing std::string (UB on nullptr).
	if (!word || !tag) {
		return false;
	}
	try {
		// DeleteUserWord returns false when the word was not present.
		return JiebaSingleton::getInstance().DeleteUserWord(string(word), string(tag));
	} catch (...) {
		// Exceptions must not cross the extern "C" ABI boundary.
		return false;
	}
}

bool find(const char* word) {
	// Null check must come first: the stray unconditional return left over
	// from the pre-review version made this guard (and the try/catch below)
	// unreachable dead code.
	if (!word) {
		return false;
	}
	try {
		// Find reports whether the word exists in the dictionary.
		return JiebaSingleton::getInstance().Find(string(word));
	} catch (...) {
		// Exceptions must not cross the extern "C" ABI boundary.
		return false;
	}
}

void freeOffsets(int* ptr) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ We currently have the following templates:
* Please note that these are reported differently, for more information refer to our [disclosure policy/procedure](https://github.com/nvaccess/nvda/blob/master/security.md)
* Issues with materials handled by translators should be reported to the [NVDA Translators list](https://groups.io/g/nvda-translations).
These include:
* NVDA interface text that is incorrect in languages other than English
* Contents of the User Guide and Changes documents that are incorrect in languages other than English
* NVDA interface text that is incorrect in languages other than English
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is an unexpected change. Could you please confirm it?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mistakenly applied the changes suggested by the AI, and need to restore the previous state.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems to come from commit b34a049, and I'm not familiar with how it works.

* Contents of the User Guide and Changes documents that are incorrect in languages other than English
* Input gestures, punctuation/symbol pronunciations, and character descriptions in languages other than English

These templates are fillable forms that guide you through the process of providing the necessary information for your issue.
Expand Down
4 changes: 2 additions & 2 deletions projectDocs/issues/readme.md
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is an unexpected change. Could you please confirm it?

Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ Do not report security concerns via GitHub issues, instead follow our [security
Issues with materials handled by translators should be reported to the [NVDA Translators list](https://groups.io/g/nvda-translations).
These include:

* NVDA interface text that is incorrect in languages other than English
* Contents of the User Guide and Changes documents that are incorrect in languages other than English
* NVDA interface text that is incorrect in languages other than English
* Contents of the User Guide and Changes documents that are incorrect in languages other than English
* Input gestures, punctuation/symbol pronunciations, and character descriptions in languages other than English

If you are reporting an issue with an application or website, please consider reporting the issue to the [authors of the application/website](./thirdPartyReporting.md) first.
Expand Down
1 change: 1 addition & 0 deletions source/config/configSpec.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@
reportClickable = boolean(default=true)
[documentNavigation]
# Hidden option to eagerly initialize Chinese word segmentation even when the current languages do not use it.
initWordSegForUnusedLang = boolean(default=false)
wordSegmentationStandard = featureFlag(optionsEnum="WordNavigationUnitFlag", behaviorOfDefault="Auto")
paragraphStyle = featureFlag(optionsEnum="ParagraphNavigationFlag", behaviorOfDefault="application")
Expand Down
3 changes: 2 additions & 1 deletion source/textInfos/offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ def wordSegFlag(self) -> WordSegFlag | None:
case config.featureFlagEnums.WordNavigationUnitFlag.CHINESE:
return WordSegFlag.CHINESE
case _:
log.error(f"Unknown word segmentation standard, {self.__wordSegConf.calculated()!r}")
log.error(f"Unknown word segmentation standard, {self.wordSegConf.calculated()!r}")
return None

#: The encoding internal to the underlying text info implementation.
encoding: Optional[str] = textUtils.WCHAR_ENCODING
Expand Down
5 changes: 4 additions & 1 deletion source/textUtils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,9 @@ def __init__(self, text: str, encoding: str = "UTF-8", wordSegFlag: WordSegFlag
self.wordSegFlag: WordSegFlag = wordSegFlag
self.strategy: wordSegStrategy.WordSegmentationStrategy = self._chooseStrategy()

def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: optimize
def _chooseStrategy(
self,
) -> wordSegStrategy.WordSegmentationStrategy: # TODO: Limit regex scans for large text.
"""Choose the appropriate segmentation strategy based on the text content."""
if self.wordSegFlag == WordSegFlag.AUTO:
if (
Expand All @@ -627,6 +629,7 @@ def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO:
return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)
case _:
return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)
return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)

def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
"""Get the segment containing the given offset."""
Expand Down
1 change: 0 additions & 1 deletion source/textUtils/wordSeg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,3 @@ def initialize():
Thread(target=callable_to_call, args=args, kwargs=kwargs, daemon=True).start()
except Exception as e:
log.debug("Initializer %s.%s failed: %s", module_name, qualname, e)
return
65 changes: 34 additions & 31 deletions source/textUtils/wordSeg/wordSegStrategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

import os
import ctypes
from ctypes import (
c_bool,
c_char_p,
c_int,
create_string_buffer,
cdll,
POINTER,
byref,
)
Expand Down Expand Up @@ -122,8 +122,8 @@ def _calculateUniscribeOffsets(

helperFunc = NVDAHelper.localLib.calculateWordOffsets

relStart = ctypes.c_int()
relEnd = ctypes.c_int()
relStart = c_int()
relEnd = c_int()
# uniscribe does some strange things
# when you give it a string with not more than two alphanumeric chars in a row.
# Inject two alphanumeric characters at the end to fix this
Expand All @@ -139,8 +139,8 @@ def _calculateUniscribeOffsets(
uniscribeLineText,
uniscribeLineLength,
relOffset,
ctypes.byref(relStart),
ctypes.byref(relEnd),
byref(relStart),
byref(relEnd),
):
relStart = relStart.value
relEnd = min(lineLength, relEnd.value)
Expand All @@ -163,27 +163,31 @@ class ChineseWordSegmentationStrategy(WordSegmentationStrategy):

@classmethod
@initializerRegistry
def _initCppJieba(cls, forceInit: bool = False): # TODO: make cppjieba alternative
def _initCppJieba(cls, forceInit: bool = False): # TODO: Add a fallback when cppjieba.dll is unavailable.
"""
Class-level initializer: attempts to load the versioned cppjieba library and
set up ctypes signatures.
"""
import config

if not forceInit and (
cls._lib
or (
config.conf["documentNavigation"]["wordSegmentationStandard"].calculated()
!= config.featureFlagEnums.WordNavigationUnitFlag.CHINESE
and not cls.isUsingRelatedLanguage()
)
):
if cls._lib:
return

if not forceInit:
documentNavigationConf = config.conf["documentNavigation"]
shouldInit = (
documentNavigationConf["wordSegmentationStandard"].calculated()
== config.featureFlagEnums.WordNavigationUnitFlag.CHINESE
or cls.isUsingRelatedLanguage()
or documentNavigationConf["initWordSegForUnusedLang"]
)
if not shouldInit:
return
try:
from NVDAState import ReadPaths

lib_path = os.path.join(ReadPaths.coreArchLibPath, "cppjieba.dll")
cls._lib = ctypes.cdll.LoadLibrary(lib_path)
cls._lib = cdll.LoadLibrary(lib_path)

# Setup function signatures
# bool initJieba(const char* dictDir)
Expand Down Expand Up @@ -222,17 +226,17 @@ def _initCppJieba(cls, forceInit: bool = False): # TODO: make cppjieba alternat
cls._lib = None

@lru_cache(maxsize=256)
def _callCppjiebaCached(self, text_utf8: bytes) -> list[int] | None:
def _callCppjiebaCached(self, text_utf8: bytes) -> list[int]:
if self._lib is None:
return None
return []

charPtr = POINTER(c_int)()
outLen = c_int(0)

try:
success: bool = self._lib.calculateWordOffsets(text_utf8, byref(charPtr), byref(outLen))
if not success or not bool(charPtr) or outLen.value <= 0:
return None
return []

try:
n = outLen.value
Expand All @@ -245,14 +249,14 @@ def _callCppjiebaCached(self, text_utf8: bytes) -> list[int] | None:
try:
if bool(charPtr):
self._lib.freeOffsets(charPtr)
except Exception:
pass
return None
except Exception as cleanupErr:
log.debugWarning("Failed to free cppjieba offsets after error: %s", cleanupErr)
return []

def _callCPPJieba(self) -> list[int] | None:
def _callCPPJieba(self) -> list[int]:
"""
Instance method: encode self.text and call cppjieba.
Returns list[int] on success, None on failure.
Returns list[int] on success, or an empty list on failure.
Uses LRU cache keyed by utf-8 bytes.
"""
data = self.text.encode("utf-8")
Expand All @@ -261,14 +265,14 @@ def _callCPPJieba(self) -> list[int] | None:
return self._callCppjiebaCached(data)
else:
if self._lib is None:
return None
return []

charPtr = POINTER(c_int)()
outLen = c_int(0)
try:
success: bool = self._lib.calculateWordOffsets(data, byref(charPtr), byref(outLen))
if not success or not bool(charPtr) or outLen.value <= 0:
return None
return []

try:
n = outLen.value
Expand All @@ -280,9 +284,9 @@ def _callCPPJieba(self) -> list[int] | None:
try:
if bool(charPtr):
self._lib.freeOffsets(charPtr)
except Exception:
pass
return None
except Exception as cleanupErr:
log.debugWarning("Failed to free cppjieba offsets after error: %s", cleanupErr)
return []

def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str:
"""Segments the text using the word end indices."""
Expand Down Expand Up @@ -317,9 +321,8 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) ->
result += sep
if newSepIndex is not None:
newSepIndex.append(len(result) - len(sep))
else:
# append the final trailing token after the loop
result += self.text[curIndex:postIndex]
# append the final trailing token after the loop
result += self.text[curIndex:postIndex]

return result

Expand Down
47 changes: 25 additions & 22 deletions source/textUtils/wordSeg/wordSegUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,11 @@ def _separatorFlag(self) -> list[bool]:
@cached_property
def computedStrToEncodedOffsets(self) -> list[int]:
"""
Compute a list of offsets so that:
encodedIndex = strIndex + relevantStrToEncodedOffsets[strIndex]

We build an explicit mapping from original string indices to encoded indices
by marking separator positions in the encoded string and then assigning
each non-separator encoded slot to the next original-character index.
The returned list contains the delta (encodedIndex - strIndex) for each
original index.
The returned list contains the absolute encoded index for each original
character index.
"""
strLen = self.strLength

Expand Down Expand Up @@ -80,40 +77,46 @@ def encodedStringLength(self) -> int:
"""Returns the length of the string in its subclass-specific encoded representation."""
return len(self.encoded)

def _strOffsetToEncodedOffset(self, offset: int) -> int:
if offset == self.strLength:
return self.encodedStringLength
return self.computedStrToEncodedOffsets[offset]

def _encodedOffsetToStrOffset(self, offset: int) -> int:
if offset == self.encodedStringLength:
return self.strLength
return self.computedEncodedToStrOffsets[offset]

def strToEncodedOffsets(
	self,
	strStart: int,
	strEnd: int | None = None,
	raiseOnError: bool = False,
) -> int | tuple[int, int]:
	"""Convert original-string offsets to encoded-string offsets.

	:param strStart: start offset in the original string; clamped to [0, strLength].
	:param strEnd: optional end offset; when given, a (start, end) tuple is returned.
	:param raiseOnError: forwarded to the base-class validation hook.
	:return: a single encoded offset, or a (start, end) tuple when strEnd is given.
	"""
	# NOTE(review): this span mixed removed pre-review lines (unclamped table
	# lookups) with the new implementation; only the clamped version is kept.
	super().strToEncodedOffsets(strStart, strEnd, raiseOnError)
	strStart = max(0, min(strStart, self.strLength))
	resultStart = self._strOffsetToEncodedOffset(strStart)
	if strEnd is None:
		return resultStart
	strEnd = max(0, min(strEnd, self.strLength))
	if strStart == strEnd:
		return (resultStart, resultStart)
	resultEnd = self._strOffsetToEncodedOffset(strEnd)
	return (resultStart, resultEnd)

def encodedToStrOffsets(
	self,
	encodedStart: int,
	encodedEnd: int | None = None,
	raiseOnError: bool = False,
) -> int | tuple[int, int]:
	"""Convert encoded-string offsets back to original-string offsets.

	:param encodedStart: start offset in the encoded string; clamped to
		[0, encodedStringLength].
	:param encodedEnd: optional end offset; when given, a (start, end) tuple is returned.
	:param raiseOnError: forwarded to the base-class validation hook.
	:return: a single string offset, or a (start, end) tuple when encodedEnd is given.
	"""
	# NOTE(review): this span mixed removed pre-review lines (unclamped table
	# lookups) with the new implementation; only the clamped version is kept.
	super().encodedToStrOffsets(encodedStart, encodedEnd, raiseOnError)
	encodedStart = max(0, min(encodedStart, self.encodedStringLength))
	resultStart = self._encodedOffsetToStrOffset(encodedStart)
	if encodedEnd is None:
		return resultStart
	encodedEnd = max(0, min(encodedEnd, self.encodedStringLength))
	if encodedStart == encodedEnd:
		return (resultStart, resultStart)
	resultEnd = self._encodedOffsetToStrOffset(encodedEnd)
	return (resultStart, resultEnd)
17 changes: 17 additions & 0 deletions tests/unit/test_textInfos.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"""Unit tests for the textInfos module, its submodules and classes."""

import unittest
from unittest.mock import patch
from .textProvider import BasicTextProvider, MockBlackBoxTextInfo
import textInfos
from textInfos.offsets import Offsets
Expand Down Expand Up @@ -192,6 +193,22 @@ def test_expandWordAtEndOfStoryWithoutFlowsToDoesNothing(self):
self.assertEqual(ti.offsets, (7, 7))


class TestWordSegFlag(unittest.TestCase):
	"""Tests for the wordSegFlag property on offset-based text infos."""

	class _UnknownWordSegConf:
		# Stand-in config object whose calculated() yields a value outside
		# the known WordNavigationUnitFlag options.
		def calculated(self):
			return "unexpected"

	def test_unknownWordSegConfigReturnsNoneAfterLogging(self):
		provider = BasicTextProvider(text="abc")
		textInfo = provider.makeTextInfo(Offsets(0, 0))
		textInfo.wordSegConf = self._UnknownWordSegConf()
		with patch("textInfos.offsets.log.error") as mockLogError:
			self.assertIsNone(textInfo.wordSegFlag)
		mockLogError.assert_called_once_with("Unknown word segmentation standard, 'unexpected'")


class TestMoveToCodepointOffsetInBlackBoxTextInfo(unittest.TestCase):
THREE_CHARS = "012"
TEN_CHARS = "0123456789"
Expand Down
Loading