diff --git a/nvdaHelper/cppjieba/cppjieba.cpp b/nvdaHelper/cppjieba/cppjieba.cpp index d63aa1f130c..815f92f7f88 100644 --- a/nvdaHelper/cppjieba/cppjieba.cpp +++ b/nvdaHelper/cppjieba/cppjieba.cpp @@ -15,7 +15,11 @@ JiebaSingleton* JiebaSingleton::instance = nullptr; std::once_flag JiebaSingleton::initFlag; JiebaSingleton& JiebaSingleton::getInstance(const char* dictDir) { - // convert incoming C-string+length to std::string (handles dictDir == nullptr) + if (!dictDir) { + throw std::invalid_argument("JiebaSingleton::getInstance() requires a non-null dictionary path."); + } + + // convert incoming C-string to std::string before entering call_once std::string dir = dictDir; // ensure singleton is constructed exactly once @@ -64,6 +68,8 @@ void JiebaSingleton::getWordEndOffsets(const std::string& text, std::vector extern "C" { bool initJieba(const char* dictDir) { + if (!dictDir) return false; + try { // simply force the singleton into existence (void)JiebaSingleton::getInstance(dictDir); @@ -105,16 +111,34 @@ bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen) { } } -bool insertUserWord(const char* word, int freq, const char* tag = cppjieba::UNKNOWN_TAG) { - return JiebaSingleton::getInstance().InsertUserWord(string(word), freq, string(tag)); +bool insertUserWord(const char* word, int freq, const char* tag) { + if (!word || !tag) return false; + + try { + return JiebaSingleton::getInstance().InsertUserWord(string(word), freq, string(tag)); + } catch (...) { + return false; + } } -bool deleteUserWord(const char* word, const char* tag = cppjieba::UNKNOWN_TAG) { - return JiebaSingleton::getInstance().DeleteUserWord(string(word), string(tag)); +bool deleteUserWord(const char* word, const char* tag) { + if (!word || !tag) return false; + + try { + return JiebaSingleton::getInstance().DeleteUserWord(string(word), string(tag)); + } catch (...) { + return false; + } } bool find(const char* word) { - return JiebaSingleton::getInstance().Find(string(word)); + if (!word) return false; + + try { + return JiebaSingleton::getInstance().Find(string(word)); + } catch (...) { + return false; + } } void freeOffsets(int* ptr) { diff --git a/projectDocs/issues/githubIssueTemplateExplanationAndExamples.md b/projectDocs/issues/githubIssueTemplateExplanationAndExamples.md index 797699ec70a..b8b9494084d 100644 --- a/projectDocs/issues/githubIssueTemplateExplanationAndExamples.md +++ b/projectDocs/issues/githubIssueTemplateExplanationAndExamples.md @@ -27,8 +27,8 @@ We currently have the following templates: * Please note that these are reported differently, for more information refer to our [disclosure policy/procedure](https://github.com/nvaccess/nvda/blob/master/security.md) * Issues with materials handled by translators should be reported to the [NVDA Translators list](https://groups.io/g/nvda-translations). These include: - * NVDA interface text that is incorrect in languages other than English - * Contents of the User Guide and Changes documents that are incorrect in languages other than English + * NVDA interface text that is incorrect in languages other than English + * Contents of the User Guide and Changes documents that are incorrect in languages other than English * Input gestures, punctuation/symbol pronunciations, and character descriptions in languages other than English These templates are fillable forms that guide you through the process of providing the necessary information for your issue. diff --git a/projectDocs/issues/readme.md b/projectDocs/issues/readme.md index 7ae9903050a..3f606b94a5e 100644 --- a/projectDocs/issues/readme.md +++ b/projectDocs/issues/readme.md @@ -10,8 +10,8 @@ Do not report security concerns via GitHub issues, instead follow our [security Issues with materials handled by translators should be reported to the [NVDA Translators list](https://groups.io/g/nvda-translations). These include: -* NVDA interface text that is incorrect in languages other than English -* Contents of the User Guide and Changes documents that are incorrect in languages other than English +* NVDA interface text that is incorrect in languages other than English +* Contents of the User Guide and Changes documents that are incorrect in languages other than English * Input gestures, punctuation/symbol pronunciations, and character descriptions in languages other than English If you are reporting an issue with an application or website, please consider reporting the issue to the [authors of the application/website](./thirdPartyReporting.md) first. diff --git a/source/braille.py b/source/braille.py index 9752bdc2293..ae9250e6e08 100644 --- a/source/braille.py +++ b/source/braille.py @@ -600,10 +600,23 @@ def update(self): if config.conf["braille"]["expandAtCursor"] and self.cursorPos is not None: mode |= louis.compbrlAtCursor - converter: OffsetConverter | None = None + converters: list[OffsetConverter] = [] textToTranslate = self.rawText textToTranslateTypeforms = self.rawTextTypeforms cursorPos = self.cursorPos + + def _applyConverter(converter: OffsetConverter) -> None: + nonlocal cursorPos, textToTranslate, textToTranslateTypeforms + if textToTranslateTypeforms is not None: + textToTranslateTypeforms = [ + textToTranslateTypeforms[converter.encodedToStrOffsets(encodedOffset)] + for encodedOffset in range(converter.encodedStringLength) + ] + if cursorPos is not None: + cursorPos = converter.strToEncodedOffsets(cursorPos) + textToTranslate = converter.encoded + converters.append(converter) + if ( config.conf["braille"]["translationTable"].startswith("zh") or config.conf["braille"]["translationTable"] == "auto" @@ -611,21 +624,9 @@ def update(self): ): from textUtils.wordSeg.wordSegUtils import WordSegWithSeparatorOffsetConverter # noqa: F401 - converter = WordSegWithSeparatorOffsetConverter(textToTranslate) - textToTranslate = converter.encoded - if cursorPos is not None: - cursorPos = converter.strToEncodedOffsets(cursorPos) + _applyConverter(WordSegWithSeparatorOffsetConverter(textToTranslate)) if config.conf["braille"]["unicodeNormalization"] and not isUnicodeNormalized(textToTranslate): - converter = UnicodeNormalizationOffsetConverter(textToTranslate) - textToTranslate = converter.encoded - if textToTranslateTypeforms is not None: - # Typeforms must be adapted to represent normalized characters. - textToTranslateTypeforms = [ - textToTranslateTypeforms[strOffset] for strOffset in converter.computedEncodedToStrOffsets - ] - if cursorPos is not None: - # Convert the cursor position to a normalized offset. - cursorPos = converter.strToEncodedOffsets(cursorPos) + _applyConverter(UnicodeNormalizationOffsetConverter(textToTranslate)) self.brailleCells, brailleToRawPos, rawToBraillePos, self.brailleCursorPos = louisHelper.translate( [handler.table.fileName, "braille-patterns.cti"], textToTranslate, @@ -634,13 +635,13 @@ def update(self): cursorPos=cursorPos, ) - if converter: - # The received brailleToRawPos contains braille to normalized positions. - # Process them to represent real raw positions by converting them from normalized ones. + for converter in reversed(converters): + # Convert liblouis offsets from the most recently transformed text + # back through each transformation to the original raw text. brailleToRawPos = [converter.encodedToStrOffsets(i) for i in brailleToRawPos] - # The received rawToBraillePos contains normalized to braille positions. - # Create a new list based on real raw positions. - rawToBraillePos = [rawToBraillePos[i] for i in converter.computedStrToEncodedOffsets] + rawToBraillePos = [ + rawToBraillePos[converter.strToEncodedOffsets(i)] for i in range(converter.strLength) + ] self.brailleToRawPos = brailleToRawPos self.rawToBraillePos = rawToBraillePos diff --git a/source/config/configSpec.py b/source/config/configSpec.py index ab1d872b61a..a68e43f6307 100644 --- a/source/config/configSpec.py +++ b/source/config/configSpec.py @@ -280,6 +280,7 @@ reportClickable = boolean(default=true) [documentNavigation] + # Hidden option to eagerly initialize Chinese word segmentation even when the current languages do not use it. initWordSegForUnusedLang = boolean(default=false) wordSegmentationStandard = featureFlag(optionsEnum="WordNavigationUnitFlag", behaviorOfDefault="Auto") paragraphStyle = featureFlag(optionsEnum="ParagraphNavigationFlag", behaviorOfDefault="application") diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index 11d8c297314..2241664d5a8 100755 --- a/source/textInfos/offsets.py +++ b/source/textInfos/offsets.py @@ -171,7 +171,8 @@ def wordSegFlag(self) -> WordSegFlag | None: case config.featureFlagEnums.WordNavigationUnitFlag.CHINESE: return WordSegFlag.CHINESE case _: - log.error(f"Unknown word segmentation standard, {self.__wordSegConf.calculated()!r}") + log.error(f"Unknown word segmentation standard, {self.wordSegConf.calculated()!r}") + return None #: The encoding internal to the underlying text info implementation. encoding: Optional[str] = textUtils.WCHAR_ENCODING diff --git a/source/textUtils/__init__.py b/source/textUtils/__init__.py index edc6757dc9c..fe17d531283 100644 --- a/source/textUtils/__init__.py +++ b/source/textUtils/__init__.py @@ -602,7 +602,9 @@ def __init__(self, text: str, encoding: str = "UTF-8", wordSegFlag: WordSegFlag self.wordSegFlag: WordSegFlag = wordSegFlag self.strategy: wordSegStrategy.WordSegmentationStrategy = self._chooseStrategy() - def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: optimize + def _chooseStrategy( + self, + ) -> wordSegStrategy.WordSegmentationStrategy: # TODO: Limit regex scans for large text. """Choose the appropriate segmentation strategy based on the text content.""" if self.wordSegFlag == WordSegFlag.AUTO: if ( @@ -627,6 +629,7 @@ def _chooseStrategy(self) -> wordSegStrategy.WordSegmentationStrategy: # TODO: return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) case _: return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) + return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: """Get the segment containing the given offset.""" diff --git a/source/textUtils/wordSeg/__init__.py b/source/textUtils/wordSeg/__init__.py index 77231b58fa3..efdd28aa1ea 100644 --- a/source/textUtils/wordSeg/__init__.py +++ b/source/textUtils/wordSeg/__init__.py @@ -43,4 +43,3 @@ def initialize(): Thread(target=callable_to_call, args=args, kwargs=kwargs, daemon=True).start() except Exception as e: log.debug("Initializer %s.%s failed: %s", module_name, qualname, e) - return diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index 7c9351170ac..2ece489f394 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -4,12 +4,12 @@ # For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt import os -import ctypes from ctypes import ( c_bool, c_char_p, c_int, create_string_buffer, + cdll, POINTER, byref, ) @@ -122,8 +122,8 @@ def _calculateUniscribeOffsets( helperFunc = NVDAHelper.localLib.calculateWordOffsets - relStart = ctypes.c_int() - relEnd = ctypes.c_int() + relStart = c_int() + relEnd = c_int() # uniscribe does some strange things # when you give it a string with not more than two alphanumeric chars in a row. # Inject two alphanumeric characters at the end to fix this @@ -139,8 +139,8 @@ def _calculateUniscribeOffsets( uniscribeLineText, uniscribeLineLength, relOffset, - ctypes.byref(relStart), - ctypes.byref(relEnd), + byref(relStart), + byref(relEnd), ): relStart = relStart.value relEnd = min(lineLength, relEnd.value) @@ -163,52 +163,56 @@ class ChineseWordSegmentationStrategy(WordSegmentationStrategy): @classmethod @initializerRegistry - def _initCppJieba(cls, forceInit: bool = False): # TODO: make cppjieba alternative + def _initCppJieba(cls, forceInit: bool = False): # TODO: Add a fallback when cppjieba.dll is unavailable. """ Class-level initializer: attempts to load the versioned cppjieba library and set up ctypes signatures. """ import config - if not forceInit and ( - cls._lib - or ( - config.conf["documentNavigation"]["wordSegmentationStandard"].calculated() - != config.featureFlagEnums.WordNavigationUnitFlag.CHINESE - and not cls.isUsingRelatedLanguage() - ) - ): + if cls._lib: return + + if not forceInit: + documentNavigationConf = config.conf["documentNavigation"] + shouldInit = ( + documentNavigationConf["wordSegmentationStandard"].calculated() + == config.featureFlagEnums.WordNavigationUnitFlag.CHINESE + or cls.isUsingRelatedLanguage() + or documentNavigationConf["initWordSegForUnusedLang"] + ) + if not shouldInit: + return try: from NVDAState import ReadPaths lib_path = os.path.join(ReadPaths.coreArchLibPath, "cppjieba.dll") - cls._lib = ctypes.cdll.LoadLibrary(lib_path) + lib = cdll.LoadLibrary(lib_path) # Setup function signatures # bool initJieba(const char* dictDir) - cls._lib.initJieba.restype = c_bool - cls._lib.initJieba.argtypes = [c_char_p] + lib.initJieba.restype = c_bool + lib.initJieba.argtypes = [c_char_p] # bool calculateWordOffsets(const char* text, int** wordEndOffsets, int* outLen) - cls._lib.calculateWordOffsets.restype = c_bool - cls._lib.calculateWordOffsets.argtypes = [c_char_p, POINTER(POINTER(c_int)), POINTER(c_int)] + lib.calculateWordOffsets.restype = c_bool + lib.calculateWordOffsets.argtypes = [c_char_p, POINTER(POINTER(c_int)), POINTER(c_int)] # bool insertUserWord(const char* word, int freq, const char* tag) - cls._lib.insertUserWord.restype = c_bool - cls._lib.insertUserWord.argtypes = [c_char_p, c_int, c_char_p] + lib.insertUserWord.restype = c_bool + lib.insertUserWord.argtypes = [c_char_p, c_int, c_char_p] # bool deleteUserWord(const char* word, const char* tag) - cls._lib.deleteUserWord.restype = c_bool - cls._lib.deleteUserWord.argtypes = [c_char_p, c_char_p] + lib.deleteUserWord.restype = c_bool + lib.deleteUserWord.argtypes = [c_char_p, c_char_p] # bool find(const char* word) - cls._lib.find.restype = c_bool - cls._lib.find.argtypes = [c_char_p] + lib.find.restype = c_bool + lib.find.argtypes = [c_char_p] # void freeOffsets(int* offsets) - cls._lib.freeOffsets.restype = None - cls._lib.freeOffsets.argtypes = [POINTER(c_int)] + lib.freeOffsets.restype = None + lib.freeOffsets.argtypes = [POINTER(c_int)] # Initialize with dictionary path import globalVars @@ -216,15 +220,19 @@ def _initCppJieba(cls, forceInit: bool = False): # TODO: make cppjieba alternat DICTS_DIR = os.path.join(globalVars.appDir, "cppjieba", "dicts") DICTS_DIR_BYTES = DICTS_DIR.encode("utf-8") dictDir = create_string_buffer(DICTS_DIR_BYTES) - cls._lib.initJieba(dictDir) + if not lib.initJieba(dictDir): + log.debugWarning("Failed to initialize cppjieba library with dictionary path: %s", DICTS_DIR) + cls._lib = None + return + cls._lib = lib except Exception as e: log.debugWarning("Failed to load cppjieba library: %s", e) cls._lib = None @lru_cache(maxsize=256) - def _callCppjiebaCached(self, text_utf8: bytes) -> list[int] | None: + def _callCppjiebaCached(self, text_utf8: bytes) -> list[int]: if self._lib is None: - return None + return [] charPtr = POINTER(c_int)() outLen = c_int(0) @@ -232,7 +240,7 @@ def _callCppjiebaCached(self, text_utf8: bytes) -> list[int] | None: try: success: bool = self._lib.calculateWordOffsets(text_utf8, byref(charPtr), byref(outLen)) if not success or not bool(charPtr) or outLen.value <= 0: - return None + return [] try: n = outLen.value @@ -245,14 +253,14 @@ def _callCppjiebaCached(self, text_utf8: bytes) -> list[int] | None: try: if bool(charPtr): self._lib.freeOffsets(charPtr) - except Exception: - pass - return None + except Exception as cleanupErr: + log.debugWarning("Failed to free cppjieba offsets after error: %s", cleanupErr) + return [] - def _callCPPJieba(self) -> list[int] | None: + def _callCPPJieba(self) -> list[int]: """ Instance method: encode self.text and call cppjieba. - Returns list[int] on success, None on failure. + Returns list[int] on success, or an empty list on failure. Uses LRU cache keyed by utf-8 bytes. """ data = self.text.encode("utf-8") @@ -261,14 +269,14 @@ def _callCPPJieba(self) -> list[int] | None: return self._callCppjiebaCached(data) else: if self._lib is None: - return None + return [] charPtr = POINTER(c_int)() outLen = c_int(0) try: success: bool = self._lib.calculateWordOffsets(data, byref(charPtr), byref(outLen)) if not success or not bool(charPtr) or outLen.value <= 0: - return None + return [] try: n = outLen.value @@ -280,9 +288,9 @@ def _callCPPJieba(self) -> list[int] | None: try: if bool(charPtr): self._lib.freeOffsets(charPtr) - except Exception: - pass - return None + except Exception as cleanupErr: + log.debugWarning("Failed to free cppjieba offsets after error: %s", cleanupErr) + return [] def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: """Segments the text using the word end indices.""" @@ -317,9 +325,8 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> result += sep if newSepIndex is not None: newSepIndex.append(len(result) - len(sep)) - else: - # append the final trailing token after the loop - result += self.text[curIndex:postIndex] + # append the final trailing token after the loop + result += self.text[curIndex:postIndex] return result diff --git a/source/textUtils/wordSeg/wordSegUtils.py b/source/textUtils/wordSeg/wordSegUtils.py index d26a26cd9ba..e88b3152ee9 100644 --- a/source/textUtils/wordSeg/wordSegUtils.py +++ b/source/textUtils/wordSeg/wordSegUtils.py @@ -29,14 +29,11 @@ def _separatorFlag(self) -> list[bool]: @cached_property def computedStrToEncodedOffsets(self) -> list[int]: """ - Compute a list of offsets so that: - encodedIndex = strIndex + relevantStrToEncodedOffsets[strIndex] - We build an explicit mapping from original string indices to encoded indices by marking separator positions in the encoded string and then assigning each non-separator encoded slot to the next original-character index. - The returned list contains the delta (encodedIndex - strIndex) for each - original index. + The returned list contains the absolute encoded index for each original + character index. """ strLen = self.strLength @@ -80,6 +77,16 @@ def encodedStringLength(self) -> int: """Returns the length of the string in its subclass-specific encoded representation.""" return len(self.encoded) + def _strOffsetToEncodedOffset(self, offset: int) -> int: + if offset == self.strLength: + return self.encodedStringLength + return self.computedStrToEncodedOffsets[offset] + + def _encodedOffsetToStrOffset(self, offset: int) -> int: + if offset == self.encodedStringLength: + return self.strLength + return self.computedEncodedToStrOffsets[offset] + def strToEncodedOffsets( self, strStart: int, @@ -87,33 +94,29 @@ def strToEncodedOffsets( raiseOnError: bool = False, ) -> int | tuple[int, int]: super().strToEncodedOffsets(strStart, strEnd, raiseOnError) - if strStart == 0: - resultStart = 0 - else: - resultStart = self.computedStrToEncodedOffsets[strStart] + strStart = max(0, min(strStart, self.strLength)) + resultStart = self._strOffsetToEncodedOffset(strStart) if strEnd is None: return resultStart - elif strStart == strEnd: + strEnd = max(0, min(strEnd, self.strLength)) + if strStart == strEnd: return (resultStart, resultStart) - else: - resultEnd = self.computedStrToEncodedOffsets[strEnd] - return (resultStart, resultEnd) + resultEnd = self._strOffsetToEncodedOffset(strEnd) + return (resultStart, resultEnd) def encodedToStrOffsets( self, encodedStart: int, encodedEnd: int | None = None, raiseOnError: bool = False, - ) -> int | tuple[int]: + ) -> int | tuple[int, int]: super().encodedToStrOffsets(encodedStart, encodedEnd, raiseOnError) - if encodedStart == 0: - resultStart = 0 - else: - resultStart = self.computedEncodedToStrOffsets[encodedStart] + encodedStart = max(0, min(encodedStart, self.encodedStringLength)) + resultStart = self._encodedOffsetToStrOffset(encodedStart) if encodedEnd is None: return resultStart - elif encodedStart == encodedEnd: + encodedEnd = max(0, min(encodedEnd, self.encodedStringLength)) + if encodedStart == encodedEnd: return (resultStart, resultStart) - else: - resultEnd = self.computedEncodedToStrOffsets[encodedEnd] - return (resultStart, resultEnd) + resultEnd = self._encodedOffsetToStrOffset(encodedEnd) + return (resultStart, resultEnd) diff --git a/tests/unit/test_braille/test_routing.py b/tests/unit/test_braille/test_routing.py index 32fff42b58d..2e4de958032 100644 --- a/tests/unit/test_braille/test_routing.py +++ b/tests/unit/test_braille/test_routing.py @@ -13,6 +13,7 @@ from ..textProvider import CursorManager, BasicTextProvider import unittest import time +from unittest.mock import Mock, patch from config.featureFlagEnums import ReviewRoutingMovesSystemCaretFlag @@ -31,6 +32,42 @@ class CursorManager(CursorManager): TextInfo = CursorManagerTextInfo +class TestBrailleOffsetConverters(unittest.TestCase): + def test_chineseWordSegmentationAndUnicodeNormalizationOffsetsAreComposed(self): + originalTranslationTable = config.conf["braille"]["translationTable"] + originalUnicodeNormalization = config.conf["braille"]["unicodeNormalization"] + config.conf["braille"]["translationTable"] = "zh-chn.ctb" + config.conf["braille"]["unicodeNormalization"] = "enabled" + + def segmentedText(sep, newSepIndex): + newSepIndex.append(1) + return "你 ℌ" + + wordSegmenter = Mock() + wordSegmenter.segmentedText.side_effect = segmentedText + translate = Mock(return_value=([1, 2, 3], [0, 1, 2], [0, 1, 2], 2)) + try: + with ( + patch("textUtils.wordSeg.wordSegUtils.WordSegmenter", return_value=wordSegmenter), + patch("braille.louisHelper.translate", translate), + ): + region = braille.Region() + region.rawText = "你ℌ" + region.rawTextTypeforms = [11, 22] + region.cursorPos = 1 + + region.update() + + self.assertEqual(translate.call_args.args[1], "你 H") + self.assertEqual(translate.call_args.kwargs["cursorPos"], 2) + self.assertEqual(translate.call_args.kwargs["typeform"], [11, 22, 22]) + self.assertEqual(region.brailleToRawPos, [0, 1, 1]) + self.assertEqual(region.rawToBraillePos, [0, 2]) + finally: + config.conf["braille"]["translationTable"] = originalTranslationTable + config.conf["braille"]["unicodeNormalization"] = originalUnicodeNormalization + + class TestReviewRoutingMovesSystemCaretInNavigableText(unittest.TestCase): """A test for the move system caret when routing review cursor braille setting when operating in navigable text with object review. diff --git a/tests/unit/test_textInfos.py b/tests/unit/test_textInfos.py index 882e58782c3..95cb05f36cf 100644 --- a/tests/unit/test_textInfos.py +++ b/tests/unit/test_textInfos.py @@ -8,6 +8,7 @@ """Unit tests for the textInfos module, its submodules and classes.""" import unittest +from unittest.mock import patch from .textProvider import BasicTextProvider, MockBlackBoxTextInfo import textInfos from textInfos.offsets import Offsets @@ -192,6 +193,22 @@ def test_expandWordAtEndOfStoryWithoutFlowsToDoesNothing(self): self.assertEqual(ti.offsets, (7, 7)) +class TestWordSegFlag(unittest.TestCase): + class _UnknownWordSegConf: + def calculated(self): + return "unexpected" + + def test_unknownWordSegConfigReturnsNoneAfterLogging(self): + obj = BasicTextProvider(text="abc") + ti = obj.makeTextInfo(Offsets(0, 0)) + ti.wordSegConf = self._UnknownWordSegConf() + + with patch("textInfos.offsets.log.error") as mockLogError: + self.assertIsNone(ti.wordSegFlag) + + mockLogError.assert_called_once_with("Unknown word segmentation standard, 'unexpected'") + + class TestMoveToCodepointOffsetInBlackBoxTextInfo(unittest.TestCase): THREE_CHARS = "012" TEN_CHARS = "0123456789" diff --git a/tests/unit/test_textUtils.py b/tests/unit/test_textUtils.py index 048c8580e78..7273abc6eaa 100644 --- a/tests/unit/test_textUtils.py +++ b/tests/unit/test_textUtils.py @@ -6,6 +6,8 @@ """Unit tests for the textUtils module.""" import unittest +from types import SimpleNamespace +from unittest.mock import Mock, patch from textUtils import UnicodeNormalizationOffsetConverter, WideStringOffsetConverter, WordSegmenter from textUtils.uniscribe import splitAtCharacterBoundaries @@ -445,6 +447,129 @@ def test_hebrew(self): self._testHelper("בְּרֵאשִׁית", ["בְּ", "רֵ", "א", "שִׁ", "י", "ת"]) +class TestChineseWordSegmentationInitialization(unittest.TestCase): + def _makeMockJiebaDll(self): + return SimpleNamespace( + initJieba=Mock(return_value=True), + calculateWordOffsets=Mock(), + insertUserWord=Mock(), + deleteUserWord=Mock(), + find=Mock(), + freeOffsets=Mock(), + ) + + def _setWordSegConfig(self, *, initForUnusedLang: bool): + import config + from config.featureFlag import FeatureFlag + from config.featureFlagEnums import WordNavigationUnitFlag + + originalInitForUnusedLang = config.conf["documentNavigation"]["initWordSegForUnusedLang"] + originalWordSegmentationStandard = config.conf["documentNavigation"]["wordSegmentationStandard"] + config.conf["documentNavigation"]["initWordSegForUnusedLang"] = initForUnusedLang + config.conf["documentNavigation"]["wordSegmentationStandard"] = FeatureFlag( + WordNavigationUnitFlag.AUTO, + behaviorOfDefault=WordNavigationUnitFlag.AUTO, + ) + + def restoreConfig(): + config.conf["documentNavigation"]["initWordSegForUnusedLang"] = originalInitForUnusedLang + config.conf["documentNavigation"]["wordSegmentationStandard"] = originalWordSegmentationStandard + + return restoreConfig + + def test_doesNotInitializeForUnusedLanguageByDefault(self): + from textUtils.wordSeg.wordSegStrategy import ChineseWordSegmentationStrategy + + originalLib = ChineseWordSegmentationStrategy._lib + restoreConfig = self._setWordSegConfig(initForUnusedLang=False) + ChineseWordSegmentationStrategy._lib = None + try: + with ( + patch.object(ChineseWordSegmentationStrategy, "isUsingRelatedLanguage", return_value=False), + patch("textUtils.wordSeg.wordSegStrategy.cdll.LoadLibrary") as loadLibrary, + ): + ChineseWordSegmentationStrategy._initCppJieba() + + loadLibrary.assert_not_called() + finally: + ChineseWordSegmentationStrategy._lib = originalLib + restoreConfig() + + def test_initializesForUnusedLanguageWhenConfigured(self): + from textUtils.wordSeg.wordSegStrategy import ChineseWordSegmentationStrategy + + mockDll = self._makeMockJiebaDll() + originalLib = ChineseWordSegmentationStrategy._lib + restoreConfig = self._setWordSegConfig(initForUnusedLang=True) + ChineseWordSegmentationStrategy._lib = None + try: + with ( + patch.object(ChineseWordSegmentationStrategy, "isUsingRelatedLanguage", return_value=False), + patch( + "textUtils.wordSeg.wordSegStrategy.cdll.LoadLibrary", + return_value=mockDll, + ) as loadLibrary, + ): + ChineseWordSegmentationStrategy._initCppJieba() + + loadLibrary.assert_called_once() + mockDll.initJieba.assert_called_once() + finally: + ChineseWordSegmentationStrategy._lib = originalLib + restoreConfig() + + def test_forceInitStillInitializesForUnusedLanguage(self): + from textUtils.wordSeg.wordSegStrategy import ChineseWordSegmentationStrategy + + mockDll = self._makeMockJiebaDll() + originalLib = ChineseWordSegmentationStrategy._lib + restoreConfig = self._setWordSegConfig(initForUnusedLang=False) + ChineseWordSegmentationStrategy._lib = None + try: + with ( + patch.object(ChineseWordSegmentationStrategy, "isUsingRelatedLanguage", return_value=False), + patch( + "textUtils.wordSeg.wordSegStrategy.cdll.LoadLibrary", + return_value=mockDll, + ) as loadLibrary, + ): + ChineseWordSegmentationStrategy._initCppJieba(forceInit=True) + + loadLibrary.assert_called_once() + mockDll.initJieba.assert_called_once() + finally: + ChineseWordSegmentationStrategy._lib = originalLib + restoreConfig() + + def test_initFailureDisablesCppJieba(self): + from textUtils.wordSeg.wordSegStrategy import ChineseWordSegmentationStrategy + + mockDll = self._makeMockJiebaDll() + mockDll.initJieba.return_value = False + originalLib = ChineseWordSegmentationStrategy._lib + restoreConfig = self._setWordSegConfig(initForUnusedLang=False) + ChineseWordSegmentationStrategy._lib = None + try: + with ( + patch.object(ChineseWordSegmentationStrategy, "isUsingRelatedLanguage", return_value=False), + patch( + "textUtils.wordSeg.wordSegStrategy.cdll.LoadLibrary", + return_value=mockDll, + ) as loadLibrary, + patch("textUtils.wordSeg.wordSegStrategy.log.debugWarning") as debugWarning, + ): + ChineseWordSegmentationStrategy._initCppJieba(forceInit=True) + + loadLibrary.assert_called_once() + mockDll.initJieba.assert_called_once() + self.assertIsNone(ChineseWordSegmentationStrategy._lib) + debugWarning.assert_called_once() + self.assertIn("Failed to initialize cppjieba", debugWarning.call_args.args[0]) + finally: + ChineseWordSegmentationStrategy._lib = originalLib + restoreConfig() + + class TestWordSegmenter(unittest.TestCase): """Tests for the WordSegmenter class.""" @@ -468,3 +593,99 @@ def test_chinese(self): self.assertEqual(segmenter.getSegmentForOffset(2), (2, 4)) self.assertEqual(segmenter.getSegmentForOffset(3), (2, 4)) self.assertEqual(segmenter.getSegmentForOffset(4), (2, 4)) + + def test_chineseSegmentationFailureStoresEmptyWordEnds(self): + from textUtils.wordSeg.wordSegStrategy import ChineseWordSegmentationStrategy + + mockDll = SimpleNamespace( + calculateWordOffsets=Mock(return_value=False), + freeOffsets=Mock(), + ) + originalLib = ChineseWordSegmentationStrategy._lib + ChineseWordSegmentationStrategy._lib = mockDll + try: + strategy = ChineseWordSegmentationStrategy("你好世界") + self.assertEqual(strategy.wordEnds, []) + self.assertEqual(strategy.segmentedText(), "你好世界") + finally: + ChineseWordSegmentationStrategy._lib = originalLib + + +class TestWordSegInitialize(unittest.TestCase): + def test_runsAllRegisteredInitializers(self): + from textUtils import wordSeg + from textUtils.wordSeg import wordSegStrategy + + calls = [] + + def firstInitializer(): + calls.append("first") + + def secondInitializer(): + calls.append("second") + + class ImmediateThread: + def __init__(self, target, args=None, kwargs=None, daemon=False): + self.target = target + self.args = () if args is None else args + self.kwargs = {} if kwargs is None else kwargs + self.daemon = daemon + + def start(self): + self.target(*self.args, **self.kwargs) + + initializerList = [ + ("missingModule", "firstInitializer", firstInitializer, (), {}), + ("missingModule", "secondInitializer", secondInitializer, (), {}), + ] + with ( + patch.object(wordSegStrategy, "initializerList", initializerList), + patch("threading.Thread", ImmediateThread), + ): + wordSeg.initialize() + + self.assertEqual(calls, ["first", "second"]) + + +class TestWordSegWithSeparatorOffsetConverter(unittest.TestCase): + def _makeConverter(self): + from textUtils.wordSeg.wordSegUtils import WordSegWithSeparatorOffsetConverter + + def segmentedText(sep, newSepIndex): + newSepIndex.append(2) + return "ab cd" + + with patch( + "textUtils.wordSeg.wordSegUtils.WordSegmenter", + return_value=SimpleNamespace(segmentedText=segmentedText), + ): + return WordSegWithSeparatorOffsetConverter("abcd") + + def test_strToEncodedOffsetsMapsEndAndClampsOutOfRange(self): + converter = self._makeConverter() + + self.assertEqual(converter.strToEncodedOffsets(-1), 0) + self.assertEqual(converter.strToEncodedOffsets(2, 4), (3, 5)) + self.assertEqual(converter.strToEncodedOffsets(4), 5) + self.assertEqual(converter.strToEncodedOffsets(5, 6), (5, 5)) + + def test_strToEncodedOffsetsRaisesOnOutOfRangeWhenRequested(self): + converter = self._makeConverter() + + with self.assertRaises(IndexError): + converter.strToEncodedOffsets(5, raiseOnError=True) + + def test_encodedToStrOffsetsMapsEndAndClampsOutOfRange(self): + converter = self._makeConverter() + + self.assertEqual(converter.encodedToStrOffsets(-1), 0) + self.assertEqual(converter.encodedToStrOffsets(2), 2) + self.assertEqual(converter.encodedToStrOffsets(2, 3), (2, 2)) + self.assertEqual(converter.encodedToStrOffsets(5), 4) + self.assertEqual(converter.encodedToStrOffsets(6, 7), (4, 4)) + + def test_encodedToStrOffsetsRaisesOnOutOfRangeWhenRequested(self): + converter = self._makeConverter() + + with self.assertRaises(IndexError): + converter.encodedToStrOffsets(6, raiseOnError=True)