/**
 * Upper bound on the number of strings `insertWildcard.process` returns.
 * The count includes the unmodified input, which is always the first element.
 */
const WILDCARD_MAX_VARIANTS = 51;

/**
 * Placeholder substituted for the elided middle of the text: U+FF5E FULLWIDTH TILDE.
 * Spelled as an escape on purpose: the fullwidth tilde is easily confused with ASCII `~`
 * (U+007E) and is folded into it by compatibility normalization, so a literal character
 * can silently change code point when the file passes through tooling.
 */
const WILDCARD_CHAR = '\uFF5E';

/**
 * Text preprocessor that produces wildcard variants of the input: a non-empty prefix and a
 * non-empty suffix of the text are kept, and at least one character between them is replaced
 * by a single `WILDCARD_CHAR`.
 *
 * `process(str)` returns an array whose first element is `str` itself, followed by the variants
 * ordered by increasing prefix length and, within one prefix length, by increasing suffix length.
 * Inputs shorter than three code points yield `[str]` only. The array never holds more than
 * `WILDCARD_MAX_VARIANTS` entries.
 * @type {import('language').TextProcessor}
 */
export const insertWildcard = {
    name: 'Insert wildcard for grammar patterns',
    description: 'いくら騒いでも → いくら\uFF5Eでも',
    process: (str) => {
        // Spread iterates by code point, so a surrogate pair is never split between prefix and suffix.
        const chars = [...str];
        const n = chars.length;
        // One character each is needed for the prefix, the elided middle and the suffix.
        if (n < 3) { return [str]; }

        /** @type {string[]} */
        const results = [str];
        // A prefix of n - 1 characters leaves no room for both a middle and a suffix, so stop before it.
        for (let prefixLen = 1; prefixLen < n - 1; prefixLen++) {
            // The prefix only depends on the outer loop; build it once per prefix length.
            const prefix = chars.slice(0, prefixLen).join('');
            // suffixLen < n - prefixLen guarantees at least one character is elided.
            for (let suffixLen = 1; suffixLen < n - prefixLen; suffixLen++) {
                const suffix = chars.slice(n - suffixLen).join('');
                results.push(prefix + WILDCARD_CHAR + suffix);
                // The number of variants grows quadratically with n; stop as soon as the cap is reached.
                if (results.length >= WILDCARD_MAX_VARIANTS) { return results; }
            }
        }
        return results;
    },
};
/*
 * Copyright (C) 2024-2026  Yomitan Authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

import {describe, expect, test} from 'vitest';
import {insertWildcard} from '../../ext/js/language/ja/japanese-text-preprocessors.js';

const {process} = insertWildcard;

// Expected wildcard, U+FF5E FULLWIDTH TILDE. Every expectation below is built from this escape so
// the suite cannot drift to ASCII `~` (U+007E) through a look-alike literal.
const WILDCARD = '\uFF5E';

describe('insertWildcard', () => {
    test('short input (1 char) returns only original', () => {
        expect(process('あ')).toStrictEqual(['あ']);
    });

    test('short input (2 chars) returns only original', () => {
        expect(process('あい')).toStrictEqual(['あい']);
    });

    test('3-char input produces 1 wildcard variant', () => {
        const variants = process('あいう');
        expect(variants).toContain('あいう');
        expect(variants).toContain(`あ${WILDCARD}う`);
        expect(variants).toHaveLength(2);
    });

    test('いくら騒いでも produces いくら~でも', () => {
        const variants = process('いくら騒いでも');
        expect(variants).toContain('いくら騒いでも');
        expect(variants).toContain(`いくら${WILDCARD}でも`);
    });

    test('single-char prefix works for ば~ほど pattern', () => {
        const variants = process('ば食べるほど');
        expect(variants).toContain('ば食べるほど');
        expect(variants).toContain(`ば${WILDCARD}ほど`);
    });

    test('しか~ない pattern works', () => {
        const variants = process('しか言わない');
        expect(variants).toContain('しか言わない');
        expect(variants).toContain(`しか${WILDCARD}ない`);
    });

    test('wildcard character is fullwidth tilde U+FF5E', () => {
        const variants = process('あいう');
        const wildcardVariant = variants.find((v) => v !== 'あいう');
        expect(wildcardVariant).toBe('あ\uFF5Eう');
    });

    test('variant count for 5-char input', () => {
        // n=5: prefixLen 1..3, for each suffixLen 1..(n-prefixLen-1)
        // p=1: s=1,2,3 (3); p=2: s=1,2 (2); p=3: s=1 (1) = 6 variants + original
        const variants = process('あいうえお');
        expect(variants).toHaveLength(7);
    });

    test('variant count for 7-char input', () => {
        // (n-1)(n-2)/2 = 6*5/2 = 15 variants + original
        const variants = process('あいうえおかき');
        expect(variants).toHaveLength(16);
    });

    test('variants are capped at 51 for long inputs', () => {
        // 15 chars would give (14*13)/2 = 91 variants uncapped, so the cap is what limits the result.
        const longStr = 'あいうえおかきくけこさしすせそ';
        const variants = process(longStr);
        expect(variants).toHaveLength(51);
    });

    test('empty string returns only original', () => {
        expect(process('')).toStrictEqual(['']);
    });

    test('original string is always first', () => {
        const input = 'いくら騒いでも';
        const variants = process(input);
        expect(variants[0]).toBe(input);
    });
});