From 661d4f1d0eb24161173817421e0ed4439dc55120 Mon Sep 17 00:00:00 2001 From: Autumn Skerritt Date: Fri, 13 Mar 2026 10:40:20 +0000 Subject: [PATCH 1/3] Add SQLite-backed dictionary storage --- CONTRIBUTING.md | 2 +- README.md | 2 +- dev/build-libs.js | 193 + dev/data/manifest-variants.json | 3 +- dev/data/zstd-dicts/jmdict.zdict | Bin 0 -> 16384 bytes dev/lib/zstd-wasm.js | 18 + ext/js/background/backend.js | 13 + ext/js/background/offscreen-proxy.js | 21 +- ext/js/background/offscreen.js | 14 +- ext/js/comm/api.js | 15 + ext/js/core/diagnostics-reporter.js | 26 + ext/js/data/anki-note-data-creator.js | 2 +- ext/js/dictionary/dictionary-database.js | 5568 +++++++++++++++-- ext/js/dictionary/dictionary-importer.js | 2297 ++++++- .../dictionary/dictionary-worker-handler.js | 11 + ext/js/dictionary/raw-term-content.js | 227 + ext/js/dictionary/sqlite-wasm.js | 688 ++ ext/js/dictionary/term-bank-wasm-parser.js | 678 ++ ext/js/dictionary/term-content-opfs-store.js | 914 +++ ext/js/dictionary/term-record-opfs-store.js | 1847 ++++++ ext/js/dictionary/term-record-wasm-encoder.js | 222 + ext/js/dictionary/wasm/term-bank-parser.c | 530 ++ ext/js/dictionary/wasm/term-record-encoder.c | 171 + ext/js/dictionary/zstd-term-content.js | 109 + ext/js/pages/settings/backup-controller.js | 90 +- ext/legal-npm.html | 2 +- ext/settings.html | 2 +- package-lock.json | 15 + package.json | 6 +- shell.nix | 5 +- test/core.test.js | 5 + test/data/database-test-cases.json | 6 +- .../translator-test-results-note-data1.json | 12 +- test/data/translator-test-results.json | 12 +- test/database.test.js | 1391 +++- test/dictionary-data.test.js | 1 - test/mocks/common.js | 42 +- test/term-content-opfs-store.test.js | 109 + test/utilities/database.js | 4 +- types/ext/api.d.ts | 10 + types/ext/dictionary-database.d.ts | 33 +- types/ext/dictionary-importer.d.ts | 22 + types/ext/offscreen.d.ts | 10 + 43 files changed, 14446 insertions(+), 902 deletions(-) create mode 100644 
dev/data/zstd-dicts/jmdict.zdict create mode 100644 dev/lib/zstd-wasm.js create mode 100644 ext/js/core/diagnostics-reporter.js create mode 100644 ext/js/dictionary/raw-term-content.js create mode 100644 ext/js/dictionary/sqlite-wasm.js create mode 100644 ext/js/dictionary/term-bank-wasm-parser.js create mode 100644 ext/js/dictionary/term-content-opfs-store.js create mode 100644 ext/js/dictionary/term-record-opfs-store.js create mode 100644 ext/js/dictionary/term-record-wasm-encoder.js create mode 100644 ext/js/dictionary/wasm/term-bank-parser.c create mode 100644 ext/js/dictionary/wasm/term-record-encoder.c create mode 100644 ext/js/dictionary/zstd-term-content.js create mode 100644 test/term-content-opfs-store.test.js diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ac84d41557..a8806e9d17 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,7 +21,7 @@ Below are a few guidelines to ensure contributions have a good level of quality ## Setup -Yomitan uses [Node.js](https://nodejs.org/) and [npm](https://www.npmjs.com/) tools for building and testing. +Yomitan uses [Node.js](https://nodejs.org/) and [npm](https://www.npmjs.com/) tools for building and testing. The build also compiles bundled dictionary wasm assets, so you need a wasm32-capable compiler such as `clang` or `zig` available on `PATH` (or pointed to by `YOMITAN_CLANG`/`CLANG`). After installing these, the development environment can be set up by running `npm ci` and subsequently `npm run build`. ## Testing diff --git a/README.md b/README.md index bb68d9e788..a49b415adc 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ Feel free to join us on the [Yomitan Discord](https://discord.gg/YkQrXW6TXF). ## Building Yomitan -1. Install [Node.js](https://nodejs.org/) and [npm](https://www.npmjs.com/). +1. Install [Node.js](https://nodejs.org/) and [npm](https://www.npmjs.com/). You also need a wasm32-capable compiler such as `clang` or `zig` on `PATH` to build the bundled dictionary wasm assets. 
If needed, set `YOMITAN_CLANG` or `CLANG` to the compiler binary you want Yomitan to use. 2. Run `npm ci` to set up the environment. diff --git a/dev/build-libs.js b/dev/build-libs.js index 72068bf47d..e9ccc89283 100644 --- a/dev/build-libs.js +++ b/dev/build-libs.js @@ -21,6 +21,8 @@ import standaloneCode from 'ajv/dist/standalone/index.js'; import esbuild from 'esbuild'; import fs from 'fs'; import {createRequire} from 'module'; +import {execFileSync} from 'node:child_process'; +import os from 'os'; import path from 'path'; import {fileURLToPath} from 'url'; import {parseJson} from './json.js'; @@ -29,6 +31,127 @@ const require = createRequire(import.meta.url); const dirname = path.dirname(fileURLToPath(import.meta.url)); const extDir = path.join(dirname, '..', 'ext'); +const dictionaryWasmTarget = 'wasm32-freestanding'; + +/** + * @typedef {{command: string, args?: string[]}} CompilerCommand + */ + +/** + * @param {string|undefined} value + * @returns {value is string} + */ +function isNonEmptyString(value) { + return typeof value === 'string' && value.length > 0; +} + +/** + * @param {string} command + * @returns {CompilerCommand} + */ +function createCompilerCommand(command) { + const name = path.basename(command).toLowerCase(); + return ( + name === 'zig' || name === 'zig.exe' ? 
+ {command, args: ['cc']} : + {command} + ); +} + +/** + * @returns {CompilerCommand[]} + */ +function getWindowsWingetCompilerCommands() { + if (process.platform !== 'win32') { return []; } + + const localAppData = process.env.LOCALAPPDATA; + if (!isNonEmptyString(localAppData)) { return []; } + + const packagesDir = path.join(localAppData, 'Microsoft', 'WinGet', 'Packages'); + if (!fs.existsSync(packagesDir)) { return []; } + + /** @type {CompilerCommand[]} */ + const commands = []; + for (const pkgEntry of fs.readdirSync(packagesDir, {withFileTypes: true})) { + if (!pkgEntry.isDirectory() || !pkgEntry.name.toLowerCase().startsWith('zig.zig')) { continue; } + const packageDir = path.join(packagesDir, pkgEntry.name); + for (const versionEntry of fs.readdirSync(packageDir, {withFileTypes: true})) { + if (!versionEntry.isDirectory()) { continue; } + const zigPath = path.join(packageDir, versionEntry.name, 'zig.exe'); + if (fs.existsSync(zigPath)) { + commands.push(createCompilerCommand(zigPath)); + } + } + } + return commands; +} + +/** + * @param {CompilerCommand} compiler + * @returns {boolean} + */ +function canBuildWasmTarget(compiler) { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'yomitan-wasm-')); + const sourcePath = path.join(tempDir, 'probe.c'); + const outputPath = path.join(tempDir, 'probe.wasm'); + try { + fs.writeFileSync(sourcePath, 'void probe(void) {}\n', 'utf8'); + execFileSync( + compiler.command, + [ + ...(compiler.args ?? 
[]), + `--target=${dictionaryWasmTarget}`, + '-nostdlib', + '-Wl,--no-entry', + '-Wl,--export=probe', + '-Wl,--strip-all', + '-o', + outputPath, + sourcePath, + ], + {stdio: 'ignore'}, + ); + return true; + } catch { + return false; + } finally { + fs.rmSync(tempDir, {recursive: true, force: true}); + } +} + +/** + * @returns {CompilerCommand} + * @throws {Error} + */ +function getWasmCapableCompiler() { + const candidates = /** @type {CompilerCommand[]} */ ([ + process.env.YOMITAN_CLANG, + process.env.CLANG, + 'clang', + 'clang-18', + 'clang-17', + 'zig', + '/opt/homebrew/opt/llvm/bin/clang', + '/usr/bin/clang', + ...getWindowsWingetCompilerCommands().map(({command}) => command), + ] + .filter(isNonEmptyString) + .map((value) => createCompilerCommand(value))); + for (const candidate of candidates) { + try { + execFileSync(candidate.command, [...(candidate.args ?? []), '--version'], {stdio: 'ignore'}); + } catch { + continue; + } + if (canBuildWasmTarget(candidate)) { + return candidate; + } + } + throw new Error( + 'Missing a wasm32-capable compiler required to build dictionary wasm assets. 
' + + `Set YOMITAN_CLANG or CLANG to a compiler that can link --target=${dictionaryWasmTarget}, such as clang or zig.`, + ); +} /** * @param {string} out @@ -40,6 +163,73 @@ async function copyWasm(out) { fs.copyFileSync(wasmPath, path.join(out, 'resvg.wasm')); } +/** + * @param {string} out + */ +async function copySqliteWasm(out) { + const sqliteWasmPath = path.dirname(require.resolve('@sqlite.org/sqlite-wasm/package.json')); + const sqliteDistPath = path.join(sqliteWasmPath, 'dist'); + const sqliteOutPath = path.join(out, 'sqlite'); + fs.mkdirSync(sqliteOutPath, {recursive: true}); + for (const fileName of fs.readdirSync(sqliteDistPath)) { + const source = path.join(sqliteDistPath, fileName); + const destination = path.join(sqliteOutPath, fileName); + fs.copyFileSync(source, destination); + } +} + +/** + * @param {string} out + */ +async function copyZstdAssets(out) { + const zstdEntryPath = require.resolve('@bokuweb/zstd-wasm'); + const zstdPkgPath = path.resolve(path.dirname(zstdEntryPath), '..', '..'); + const zstdWasmPath = path.join(zstdPkgPath, 'dist/esm/zstd.wasm'); + fs.copyFileSync(zstdWasmPath, path.join(out, 'zstd.wasm')); + + const zstdDictOutPath = path.join(out, 'zstd-dicts'); + fs.mkdirSync(zstdDictOutPath, {recursive: true}); + const jmdictDictPath = path.join(dirname, 'data', 'zstd-dicts', 'jmdict.zdict'); + if (!fs.existsSync(jmdictDictPath)) { + throw new Error(`Missing vendored zstd dictionary asset: ${jmdictDictPath}`); + } + fs.copyFileSync(jmdictDictPath, path.join(zstdDictOutPath, 'jmdict.zdict')); +} + +/** + * @param {string} out + */ +async function buildDictionaryWasm(out) { + const wasmSources = [ + { + sourcePath: path.join(extDir, 'js', 'dictionary', 'wasm', 'term-bank-parser.c'), + outputPath: path.join(out, 'term-bank-parser.wasm'), + exports: ['wasm_reset_heap', 'wasm_alloc', 'parse_term_bank', 'encode_term_content'], + }, + { + sourcePath: path.join(extDir, 'js', 'dictionary', 'wasm', 'term-record-encoder.c'), + outputPath: 
path.join(out, 'term-record-encoder.wasm'), + exports: ['wasm_reset_heap', 'wasm_alloc', 'calc_encoded_size', 'encode_records'], + }, + ]; + + const compiler = getWasmCapableCompiler(); + + for (const target of wasmSources) { + const args = [ + `--target=${dictionaryWasmTarget}`, + '-O3', + '-nostdlib', + '-Wl,--no-entry', + ]; + for (const exportName of target.exports) { + args.push(`-Wl,--export=${exportName}`); + } + args.push('-Wl,--strip-all', '-o', target.outputPath, target.sourcePath); + execFileSync(compiler.command, [...(compiler.args ?? []), ...args], {stdio: 'inherit'}); + } +} + /** * @param {string} scriptPath @@ -95,4 +285,7 @@ export async function buildLibs() { fs.writeFileSync(path.join(extDir, 'lib/validate-schemas.js'), patchedModuleCode); await copyWasm(path.join(extDir, 'lib')); + await copySqliteWasm(path.join(extDir, 'lib')); + await copyZstdAssets(path.join(extDir, 'lib')); + await buildDictionaryWasm(path.join(extDir, 'lib')); } diff --git a/dev/data/manifest-variants.json b/dev/data/manifest-variants.json index 5b7e1bdb80..81ebac9a67 100644 --- a/dev/data/manifest-variants.json +++ b/dev/data/manifest-variants.json @@ -106,7 +106,8 @@ "popup.html", "template-renderer.html", "js/*", - "lib/resvg.wasm" + "lib/resvg.wasm", + "lib/sqlite/*" ], "matches": [ "" diff --git a/dev/data/zstd-dicts/jmdict.zdict b/dev/data/zstd-dicts/jmdict.zdict new file mode 100644 index 0000000000000000000000000000000000000000..5c964dbe65205eb5245f79379173a208005640d4 GIT binary patch literal 16384 zcmeHOUyLM08DHWl7ebIk6A4jMbcCBr_AUoRNNz(^LPQfV8qNnjiK*$XnJKorYpCkp zxxw7SEVpn6hag7+h}>~G)5mZ#uwjwaECv4i3t~DqQ3e2YG!AvW_o)6 z?+s|eay{K&Re$ySzWVC>*EgQo_xGQi_~zWLmp>Cddf9Kz|KcxKUiG(cfB)Zq`t$TF z*X`SJ;cshu9 z#{I#m%7mduF;9u^E{-84YsZ4#yt@+9%i^v(IOUgPB*X!tzu!A$-@TY#)=FNO;20~ zhzizF+6;3w^h$_7sK?3}uG@~}L<&yFN-mT-giJlbO*;@fm#&b4N-UvWJdWE0)nNJ> zw9hIAc#&AL440=iO`g)P-dc7UMr#a6OG-6SrZsizT9>tJ@yM(1zVh$}RC3(HGo`Sl zB9@3qhO?}4JN?V+iZxZ-PLysoc!}~fxYX_bSWkgm%!6vKSr{v9RF7sv$u>{o>)NRf 
zL_w@bCasoKd1hvy|5;-Q)Rx;?ae*j2%cZqa`^C=ONbZ+}G{A-t9bA4oxg-SqU)R2OeEDYmv`Qu=w3));)G3 z?_JAFSBd5Pe7Ey36wDsUE#BUZ(_7<&#jBh{%MnG8X+9_7Sf+K1(JHSu#?v)zqYjJijqN?aWES2-Z61-{o|f(LNwh37$U-k zowg|m90UlIWY{jU+a5@iDUfMSI@yx&+{W%WU>Vwjw9Fi+su7xuD&|4LVEk-aBq=Ym z&03fe%U$J&55kgUgvoUivMptR%fTU+2p;>A_dp^yj!sv9zDd|Yu0P;j2Nk52c2g3n zwbW%%KR$V`w<*L!sNmWX=>D0npPOaR~9Ak#JW_ zBtgnG^nClJAWRdUqhZ|+{PEYj^AB`8&sVzh^Fte>Ag!9>En&KxG z+=uJVEg=8IDkRw5GuE$;+cV6U){_mzQ)zPwwL>HWbrJ96IJ-=Ua zW1gY$z*Q=u;z2XdtP?tocpA|Q#l_P|wwgT7%!;m6jn65~`}hr#?}e(jl?8<$H~C&B zQyb&ski~-Lh7@D#O}Ybaq5V4hz{^Mg*)-6Px}75T9KViaQ`5e)mE`xQ!?f{7_dBAi&$FHs!JKfnzi056a-FCO<=y$vtEU>uTTwG zd}0B77mge$v7j2TCcZjvbm#8}gZ&qb7Na)`KQ#&9v}BY>kwby9w|_y$jmZN`a(aXkpv?SO zZj!A^qQZt8?1yoUC|hb#OhV-eEkl&9(L#bkLxfHltrAv`a~N|-z&>4(%gt<3aFx>0 zxI*vbotIw%Z+@7Gd+YW>tV~$NUp+OjP}75E@GvC&Oq+3QGn|O1Z&A3FSnx=m%V64~ z|F*_XW89t#TiFg4PyNj6JaAq6;M^IqtiJqkuULLPV^7@%!Mnda>P;FT8`ABZ9+|6m zdll0iqeh(3)`mg3?v%%AH$y`@?IO}gaXD<*DY-z4b95#@k zj3(GBCNk*A60Cv|HeWqQ2AZsECJ;(d)iM$=3f8-$Ad4zcnas#Q)ce0fNhRN*eAE!b z#IEJ0pPfMa2O5YPC_ee)Ce3Dh3Ttg;BR8{t#@V%5LD=B7Qzqj$?BEW#6F8W-w}Yxm zs;nCG(%d>dW)*9nQKb@xnrI6{;OoQZv0@s8aSJthPnRYE;&Dho3p44n>n+KuKGmRP zx$;9xYcegLs4CNvNCj`I0dd<0v>X-vA3G%Z0nPgmJ1T8qgWEkOM5L0Eoz+uQbmz8E zq_kxcoui4B>|s5RZEkAiL=yVt=iViM|duJI~V%agNb)|Tk%NN|>%EkxE5eA^UejFmnZ+<|Up zWO|0%N~tZF6aRf*3LTY8sy*llgPZ|rGxOpZSTD>S*yowzG~e}J@vx4<*b-WE8N)hS&KPfi#dk}m=1of^81@H zRxX&!f>$;;`p45a!>ZOEk085RZ8z|1LT>l@|2D-YEJ<_ zlZMV!K^ zSFrBmh81z-GC>uX!uM_6z{R!M{B?5+9bGS=ezUA-#+y+xbwYJ_hGSjIhq_D{qD;{fK~3`u{?y7U$q^p6s^t_ zPG4}pa-y1v_?(Q8T}6XKnfrimYM@xYM!A}Y)-PR_I27ref0`C+->Au2QB2C y(L9e2gR. 
+ */ + +export * from '@bokuweb/zstd-wasm'; diff --git a/ext/js/background/backend.js b/ext/js/background/backend.js index dedaeb3af1..9c603bb564 100644 --- a/ext/js/background/backend.js +++ b/ext/js/background/backend.js @@ -170,6 +170,8 @@ export class Backend { ['getZoom', this._onApiGetZoom.bind(this)], ['getDefaultAnkiFieldTemplates', this._onApiGetDefaultAnkiFieldTemplates.bind(this)], ['getDictionaryInfo', this._onApiGetDictionaryInfo.bind(this)], + ['exportDictionaryDatabase', this._onApiExportDictionaryDatabase.bind(this)], + ['importDictionaryDatabase', this._onApiImportDictionaryDatabase.bind(this)], ['purgeDatabase', this._onApiPurgeDatabase.bind(this)], ['getMedia', this._onApiGetMedia.bind(this)], ['logGenericErrorBackend', this._onApiLogGenericErrorBackend.bind(this)], @@ -927,6 +929,17 @@ export class Backend { return await this._dictionaryDatabase.getDictionaryInfo(); } + /** @type {import('api').ApiHandler<'exportDictionaryDatabase'>} */ + async _onApiExportDictionaryDatabase() { + return await this._dictionaryDatabase.exportDatabase(); + } + + /** @type {import('api').ApiHandler<'importDictionaryDatabase'>} */ + async _onApiImportDictionaryDatabase({content}) { + await this._dictionaryDatabase.importDatabase(content); + this._triggerDatabaseUpdated('dictionary', 'import'); + } + /** @type {import('api').ApiHandler<'purgeDatabase'>} */ async _onApiPurgeDatabase() { await this._dictionaryDatabase.purge(); diff --git a/ext/js/background/offscreen-proxy.js b/ext/js/background/offscreen-proxy.js index fb9ebb9cf5..2a689440db 100644 --- a/ext/js/background/offscreen-proxy.js +++ b/ext/js/background/offscreen-proxy.js @@ -18,7 +18,7 @@ import {ExtensionError} from '../core/extension-error.js'; import {isObjectNotArray} from '../core/object-utilities.js'; -import {base64ToArrayBuffer} from '../data/array-buffer-util.js'; +import {arrayBufferToBase64, base64ToArrayBuffer} from '../data/array-buffer-util.js'; /** * This class is responsible for creating 
and communicating with an offscreen document. @@ -190,6 +190,25 @@ export class DictionaryDatabaseProxy { return this._offscreen.sendMessagePromise({action: 'getDictionaryInfoOffscreen'}); } + /** + * @returns {Promise} + */ + async exportDatabase() { + const content = await this._offscreen.sendMessagePromise({action: 'exportDictionaryDatabaseOffscreen'}); + return base64ToArrayBuffer(content); + } + + /** + * @param {ArrayBuffer} content + * @returns {Promise} + */ + async importDatabase(content) { + await this._offscreen.sendMessagePromise({ + action: 'importDictionaryDatabaseOffscreen', + params: {content: arrayBufferToBase64(content)}, + }); + } + /** * @returns {Promise} */ diff --git a/ext/js/background/offscreen.js b/ext/js/background/offscreen.js index 4dea9930ea..4e535069ea 100644 --- a/ext/js/background/offscreen.js +++ b/ext/js/background/offscreen.js @@ -22,7 +22,7 @@ import {createApiMap, invokeApiMapHandler} from '../core/api-map.js'; import {ExtensionError} from '../core/extension-error.js'; import {log} from '../core/log.js'; import {sanitizeCSS} from '../core/utilities.js'; -import {arrayBufferToBase64} from '../data/array-buffer-util.js'; +import {arrayBufferToBase64, base64ToArrayBuffer} from '../data/array-buffer-util.js'; import {DictionaryDatabase} from '../dictionary/dictionary-database.js'; import {WebExtension} from '../extension/web-extension.js'; import {Translator} from '../language/translator.js'; @@ -55,6 +55,8 @@ export class Offscreen { ['clipboardSetBrowserOffscreen', this._setClipboardBrowser.bind(this)], ['databasePrepareOffscreen', this._prepareDatabaseHandler.bind(this)], ['getDictionaryInfoOffscreen', this._getDictionaryInfoHandler.bind(this)], + ['exportDictionaryDatabaseOffscreen', this._exportDictionaryDatabaseHandler.bind(this)], + ['importDictionaryDatabaseOffscreen', this._importDictionaryDatabaseHandler.bind(this)], ['databasePurgeOffscreen', this._purgeDatabaseHandler.bind(this)], ['databaseGetMediaOffscreen', 
this._getMediaHandler.bind(this)], ['translatorPrepareOffscreen', this._prepareTranslatorHandler.bind(this)], @@ -117,6 +119,16 @@ export class Offscreen { return await this._dictionaryDatabase.getDictionaryInfo(); } + /** @type {import('offscreen').ApiHandler<'exportDictionaryDatabaseOffscreen'>} */ + async _exportDictionaryDatabaseHandler() { + return arrayBufferToBase64(await this._dictionaryDatabase.exportDatabase()); + } + + /** @type {import('offscreen').ApiHandler<'importDictionaryDatabaseOffscreen'>} */ + async _importDictionaryDatabaseHandler({content}) { + await this._dictionaryDatabase.importDatabase(base64ToArrayBuffer(content)); + } + /** @type {import('offscreen').ApiHandler<'databasePurgeOffscreen'>} */ async _purgeDatabaseHandler() { return await this._dictionaryDatabase.purge(); diff --git a/ext/js/comm/api.js b/ext/js/comm/api.js index f42b2dd449..e7573b802f 100644 --- a/ext/js/comm/api.js +++ b/ext/js/comm/api.js @@ -248,6 +248,21 @@ export class API { return this._invoke('getDictionaryInfo', void 0); } + /** + * @returns {Promise>} + */ + exportDictionaryDatabase() { + return this._invoke('exportDictionaryDatabase', void 0); + } + + /** + * @param {import('api').ApiParam<'importDictionaryDatabase', 'content'>} content + * @returns {Promise>} + */ + importDictionaryDatabase(content) { + return this._invoke('importDictionaryDatabase', {content}); + } + /** * @returns {Promise>} */ diff --git a/ext/js/core/diagnostics-reporter.js b/ext/js/core/diagnostics-reporter.js new file mode 100644 index 0000000000..4585e47ef5 --- /dev/null +++ b/ext/js/core/diagnostics-reporter.js @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2023-2025 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/** + * PR1 keeps only correctness-essential diagnostics paths. + * The richer diagnostics pipeline from the fork is intentionally not upstreamed here. + * @param {string} _event + * @param {unknown} [_payload] + */ +export function reportDiagnostics(_event, _payload = {}) { + // NOP +} diff --git a/ext/js/data/anki-note-data-creator.js b/ext/js/data/anki-note-data-creator.js index 23e3e3c4f6..763a65821a 100644 --- a/ext/js/data/anki-note-data-creator.js +++ b/ext/js/data/anki-note-data-creator.js @@ -586,7 +586,7 @@ function getTermDictionaryEntryCommonInfo(dictionaryEntry, type, dictionaryStyle uniqueTerms, uniqueReadings, definitionTags, - definitions: hasDefinitions ? definitions : void 0, + ...(hasDefinitions ? {definitions} : {}), }; } diff --git a/ext/js/dictionary/dictionary-database.js b/ext/js/dictionary/dictionary-database.js index 581cea3499..ca861604fd 100644 --- a/ext/js/dictionary/dictionary-database.js +++ b/ext/js/dictionary/dictionary-database.js @@ -16,52 +16,228 @@ * along with this program. If not, see . 
*/ +/* eslint-disable-next-line @typescript-eslint/ban-ts-comment */ +// @ts-nocheck + import {initWasm, Resvg} from '../../lib/resvg-wasm.js'; import {createApiMap, invokeApiMapHandler} from '../core/api-map.js'; +import {reportDiagnostics} from '../core/diagnostics-reporter.js'; import {ExtensionError} from '../core/extension-error.js'; +import {parseJson} from '../core/json.js'; import {log} from '../core/log.js'; import {safePerformance} from '../core/safe-performance.js'; +import {toError} from '../core/to-error.js'; import {stringReverse} from '../core/utilities.js'; -import {Database} from '../data/database.js'; +import {deleteOpfsDatabaseFiles, didLastOpenUseFallbackStorage, getLastOpenStorageDiagnostics, getSqlite3, importOpfsDatabase, openOpfsDatabase} from './sqlite-wasm.js'; +import { + compressTermContentZstd, + decompressTermContentZstd, + initializeTermContentZstd, + logTermContentZstdError, + resolveTermContentZstdDictName, +} from './zstd-term-content.js'; +import { + decodeRawTermContentHeader, + RAW_TERM_CONTENT_COMPRESSED_SHARED_GLOSSARY_DICT_NAME, + decodeRawTermContentSharedGlossaryHeader, + encodeRawTermContentBinary, + getRawTermContentGlossaryJsonBytes, + isRawTermContentBinary, + isRawTermContentSharedGlossaryBinary, + RAW_TERM_CONTENT_DICT_NAME, + RAW_TERM_CONTENT_SHARED_GLOSSARY_DICT_NAME, +} from './raw-term-content.js'; +import {decompress as zstdDecompress} from '../../lib/zstd-wasm.js'; +import {TermContentOpfsStore} from './term-content-opfs-store.js'; +import {TermRecordOpfsStore} from './term-record-opfs-store.js'; + +const CURRENT_DICTIONARY_SCHEMA_VERSION = 4; +const TERM_ENTRY_CONTENT_CACHE_MAX_ENTRIES = 4096; +const DEFAULT_STATEMENT_CACHE_MAX_ENTRIES = 256; +const LOW_MEMORY_STATEMENT_CACHE_MAX_ENTRIES = 128; +const DEFAULT_TERM_EXACT_PRESENCE_CACHE_MAX_ENTRIES = 25000; +const LOW_MEMORY_TERM_EXACT_PRESENCE_CACHE_MAX_ENTRIES = 8000; +const TERM_BULK_ADD_STAGING_MAX_ROWS = 3000; +const DEFAULT_TERM_BULK_ADD_STAGING_MAX_ROWS = 
4096; +const HIGH_MEMORY_TERM_BULK_ADD_STAGING_MAX_ROWS = 10240; +const TERM_CONTENT_STORAGE_MODE_BASELINE = 'baseline'; +const TERM_CONTENT_STORAGE_MODE_RAW_BYTES = 'raw-bytes'; +const DEFAULT_RAW_TERM_CONTENT_PACK_TARGET_BYTES = 4 * 1024 * 1024; + +/** + * @param {string} value + * @returns {[number, number]|null} + */ +function parseContentHashHexPair(value) { + if (value.length !== 16) { return null; } + const hash1 = Number.parseInt(value.slice(0, 8), 16); + const hash2 = Number.parseInt(value.slice(8, 16), 16); + if (!Number.isFinite(hash1) || !Number.isFinite(hash2)) { return null; } + return [hash1 >>> 0, hash2 >>> 0]; +} + +/** + * @param {Uint8Array[]} chunks + * @param {number} targetBytes + * @returns {{packedChunks: Uint8Array[], sourceChunkIndices: number[], sourceChunkLocalOffsets: number[]}} + */ +function packContentChunksIntoSlabs(chunks, targetBytes) { + /** @type {Uint8Array[]} */ + const packedChunks = []; + /** @type {number[]} */ + const sourceChunkIndices = new Array(chunks.length); + /** @type {number[]} */ + const sourceChunkLocalOffsets = new Array(chunks.length); + let startIndex = 0; + while (startIndex < chunks.length) { + let totalBytes = 0; + let endIndex = startIndex; + while (endIndex < chunks.length) { + const nextBytes = chunks[endIndex].byteLength; + if (totalBytes > 0 && (totalBytes + nextBytes) > targetBytes) { + break; + } + totalBytes += nextBytes; + ++endIndex; + } + if (totalBytes <= 0) { + sourceChunkIndices[startIndex] = packedChunks.length; + sourceChunkLocalOffsets[startIndex] = 0; + packedChunks.push(chunks[startIndex]); + ++startIndex; + continue; + } + const packedIndex = packedChunks.length; + const packed = new Uint8Array(totalBytes); + let offset = 0; + for (let i = startIndex; i < endIndex; ++i) { + const chunk = chunks[i]; + sourceChunkIndices[i] = packedIndex; + sourceChunkLocalOffsets[i] = offset; + packed.set(chunk, offset); + offset += chunk.byteLength; + } + packedChunks.push(packed); + startIndex = 
endIndex; + } + return {packedChunks, sourceChunkIndices, sourceChunkLocalOffsets}; +} + + +/** + * @typedef {object} InsertStatement + * @property {string} sql + * @property {(item: unknown) => import('@sqlite.org/sqlite-wasm').BindingSpec} bind + */ export class DictionaryDatabase { constructor() { - /** @type {Database} */ - this._db = new Database(); - /** @type {string} */ - this._dbName = 'dict'; - /** @type {import('dictionary-database').CreateQuery} */ - this._createOnlyQuery1 = (item) => IDBKeyRange.only(item); - /** @type {import('dictionary-database').CreateQuery} */ - this._createOnlyQuery2 = (item) => IDBKeyRange.only(item.query); - /** @type {import('dictionary-database').CreateQuery} */ - this._createOnlyQuery3 = (item) => IDBKeyRange.only(item.term); - /** @type {import('dictionary-database').CreateQuery} */ - this._createOnlyQuery4 = (item) => IDBKeyRange.only(item.path); - /** @type {import('dictionary-database').CreateQuery} */ - this._createOnlyQuery5 = (item) => IDBKeyRange.only(item.path); - /** @type {import('dictionary-database').CreateQuery} */ - this._createBoundQuery1 = (item) => IDBKeyRange.bound(item, `${item}\uffff`, false, false); - /** @type {import('dictionary-database').CreateQuery} */ - this._createBoundQuery2 = (item) => { - item = stringReverse(item); - return IDBKeyRange.bound(item, `${item}\uffff`, false, false); - }; - /** @type {import('dictionary-database').CreateResult} */ - this._createTermBind1 = this._createTermExact.bind(this); - /** @type {import('dictionary-database').CreateResult} */ - this._createTermBind2 = this._createTermSequenceExact.bind(this); - /** @type {import('dictionary-database').CreateResult} */ - this._createTermMetaBind = this._createTermMeta.bind(this); - /** @type {import('dictionary-database').CreateResult} */ - this._createKanjiBind = this._createKanji.bind(this); - /** @type {import('dictionary-database').CreateResult} */ - this._createKanjiMetaBind = this._createKanjiMeta.bind(this); - /** 
@type {import('dictionary-database').CreateResult} */ - this._createMediaBind = this._createMedia.bind(this); - /** @type {import('dictionary-database').CreateResult} */ - this._createDrawMediaBind = this._createDrawMedia.bind(this); - + /** @type {import('@sqlite.org/sqlite-wasm').Sqlite3Static|null} */ + this._sqlite3 = null; + /** @type {import('@sqlite.org/sqlite-wasm').Database|null} */ + this._db = null; + /** @type {boolean} */ + this._isOpening = false; + /** @type {boolean} */ + this._usesFallbackStorage = false; + /** @type {{mode: string, forceFallback: boolean, hasOpfsDbCtor: boolean, hasOpfsImportDb: boolean, hasWasmfsDir: boolean, attempts?: Array<{strategy: string, target: string, flags: string, error: string}>, lastError?: string|null}|null} */ + this._openStorageDiagnostics = null; + /** @type {Record|null} */ + this._startupCleanupIncompleteImportsSummary = null; + /** @type {Record|null} */ + this._startupCleanupMissingTermRecordShardsSummary = null; + /** @type {number} */ + this._bulkImportDepth = 0; + /** @type {boolean} */ + this._bulkImportTransactionOpen = false; + /** @type {boolean} */ + this._deferTermsVirtualTableSync = false; + /** @type {boolean} */ + this._termsVirtualTableDirty = false; + /** @type {Map} */ + this._termEntryContentIdByKey = new Map(); + /** @type {Map} */ + this._termEntryContentIdByHash = new Map(); + /** @type {Map} */ + this._termEntryContentMetaByHash = new Map(); + /** @type {Map>} */ + this._termEntryContentMetaByHashPair = new Map(); + /** @type {boolean} */ + this._termEntryContentHasExistingRows = true; + /** @type {boolean} */ + this._enableTermEntryContentDedup = true; + /** @type {Map} */ + this._statementCache = new Map(); + /** @type {number} */ + this._statementCacheMaxEntries = this._computeStatementCacheMaxEntries(); + /** @type {Map} */ + this._termEntryContentCache = new Map(); + /** @type {Map} */ + this._sharedGlossaryArtifactMetaByDictionary = new Map(); + /** @type {Map} */ + 
this._sharedGlossaryArtifactInflatedByDictionary = new Map(); + /** @type {number} */ + this._termEntryContentCacheMaxEntries = TERM_ENTRY_CONTENT_CACHE_MAX_ENTRIES; + /** @type {TextEncoder} */ + this._textEncoder = new TextEncoder(); + /** @type {TextDecoder} */ + this._textDecoder = new TextDecoder(); + /** @type {boolean} */ + this._termContentZstdInitialized = false; + /** @type {'baseline'|'raw-bytes'} */ + this._termContentStorageMode = TERM_CONTENT_STORAGE_MODE_BASELINE; + /** @type {Map} */ + this._termExactPresenceCache = new Map(); + /** @type {number} */ + this._termExactPresenceCacheMaxEntries = this._computeTermExactPresenceCacheMaxEntries(); + /** @type {Map} */ + this._termPrefixNegativeCache = new Map(); + /** @type {number|null} */ + this._maxHeadwordLengthCache = null; + /** @type {Map, reading: Map, expressionReverse: Map, readingReverse: Map, pair: Map, sequence: Map}>} */ + this._directTermIndexByDictionary = new Map(); + /** @type {import('@sqlite.org/sqlite-wasm').sqlite3_module|null} */ + this._termsVtabModule = null; + /** @type {boolean} */ + this._termsVtabModuleRegistered = false; + /** @type {Map} */ + this._termsVtabCursorState = new Map(); + /** @type {boolean} */ + this._enableSqliteSecondaryIndexes = false; + /** @type {number} */ + this._termContentCompressionMinBytes = 1048576; + /** @type {number} */ + this._rawTermContentPackTargetBytes = DEFAULT_RAW_TERM_CONTENT_PACK_TARGET_BYTES; + /** @type {boolean} */ + this._importDebugLogging = false; + /** @type {number} */ + this._termBulkAddLogIntervalMs = 3000; + /** @type {number} */ + this._termBulkAddFailFastMinRowsPerSecond = 1200; + /** @type {number} */ + this._termBulkAddFailFastSlowBatchMs = 15000; + /** @type {number} */ + this._termBulkAddFailFastMinRowsBeforeCheck = 32768; + /** @type {number} */ + this._termBulkAddFailFastWindowSize = 5; + /** @type {number} */ + this._termBulkAddBatchSize = 25000; + /** @type {boolean} */ + this._adaptiveTermBulkAddBatchSize = true; + 
/** @type {boolean} */ + this._retryBeginImmediateTransaction = false; + /** @type {boolean} */ + this._skipIntraBatchContentDedup = false; + /** @type {number} */ + this._termBulkAddStagingMaxRows = this._computeDefaultTermBulkAddStagingMaxRows(); + /** @type {boolean} */ + this._termRecordRowAppendFastPath = true; + /** @type {{contentAppendMs: number, termRecordBuildMs: number, termRecordEncodeMs: number, termRecordWriteMs: number, termsVtabInsertMs: number}|null} */ + this._lastBulkAddTermsMetrics = null; + /** @type {TermContentOpfsStore} */ + this._termContentStore = new TermContentOpfsStore(); + /** @type {TermRecordOpfsStore} */ + this._termRecordStore = new TermRecordOpfsStore(); /** * @type {Worker?} */ @@ -78,678 +254,4898 @@ export class DictionaryDatabase { ]); } - /** - * do upgrades for the IndexedDB schema (basically limited to adding new stores when needed) - */ + /** */ async prepare() { - // do not do upgrades in web workers as they are considered to be children of the main thread and are not responsible for database upgrades - const isWorker = self.constructor.name !== 'Window'; - const upgrade = - /** @type {import('database').StructureDefinition[]?} */ - ([ - /** @type {import('database').StructureDefinition} */ - ({ - version: 20, - stores: { - terms: { - primaryKey: {keyPath: 'id', autoIncrement: true}, - indices: ['dictionary', 'expression', 'reading'], - }, - kanji: { - primaryKey: {autoIncrement: true}, - indices: ['dictionary', 'character'], - }, - tagMeta: { - primaryKey: {autoIncrement: true}, - indices: ['dictionary'], - }, - dictionaries: { - primaryKey: {autoIncrement: true}, - indices: ['title', 'version'], - }, - }, - }), - { - version: 30, - stores: { - termMeta: { - primaryKey: {autoIncrement: true}, - indices: ['dictionary', 'expression'], - }, - kanjiMeta: { - primaryKey: {autoIncrement: true}, - indices: ['dictionary', 'character'], - }, - tagMeta: { - primaryKey: {autoIncrement: true}, - indices: ['dictionary', 'name'], - }, 
- }, - }, - { - version: 40, - stores: { - terms: { - primaryKey: {keyPath: 'id', autoIncrement: true}, - indices: ['dictionary', 'expression', 'reading', 'sequence'], - }, - }, - }, - { - version: 50, - stores: { - terms: { - primaryKey: {keyPath: 'id', autoIncrement: true}, - indices: ['dictionary', 'expression', 'reading', 'sequence', 'expressionReverse', 'readingReverse'], - }, - }, - }, - { - version: 60, - stores: { - media: { - primaryKey: {keyPath: 'id', autoIncrement: true}, - indices: ['dictionary', 'path'], - }, - }, - }, - ]); - await this._db.open( - this._dbName, - 60, - isWorker ? null : upgrade, - ); + if (this._db !== null) { + throw new Error('Database already open'); + } + if (this._isOpening) { + throw new Error('Already opening'); + } - // when we are not a worker ourselves, create a worker which is basically just a wrapper around this class, which we can use to offload some functions to - if (!isWorker) { - this._worker = new Worker('/js/dictionary/dictionary-database-worker-main.js', {type: 'module'}); - this._worker.addEventListener('error', (event) => { - log.log('Worker terminated with error:', event); - }); - this._worker.addEventListener('unhandledrejection', (event) => { - log.log('Unhandled promise rejection in worker:', event); - }); - } else { - // when we are the worker, prepare to need to do some SVG work and load appropriate wasm & fonts - await initWasm(fetch('/lib/resvg.wasm')); + try { + this._isOpening = true; + await this._openConnection(); + await initializeTermContentZstd(); + this._termContentZstdInitialized = true; + await this._deleteLegacyIndexedDb(); + await this._cleanupIncompleteImports(); + await this._cleanupMissingTermRecordShards(); - const font = await fetch('/fonts/NotoSansJP-Regular.ttf'); - const fontData = await font.arrayBuffer(); - this._resvgFontBuffer = new Uint8Array(fontData); + // keep existing draw worker split behaviour. 
+ const isWorker = self.constructor.name !== 'Window'; + if (!isWorker && this._worker === null) { + this._worker = new Worker('/js/dictionary/dictionary-database-worker-main.js', {type: 'module'}); + this._worker.addEventListener('error', (event) => { + log.log('Worker terminated with error:', event); + }); + this._worker.addEventListener('unhandledrejection', (event) => { + log.log('Unhandled promise rejection in worker:', event); + }); + } else if (isWorker && this._resvgFontBuffer === null) { + await initWasm(fetch('/lib/resvg.wasm')); + + const font = await fetch('/fonts/NotoSansJP-Regular.ttf'); + const fontData = await font.arrayBuffer(); + this._resvgFontBuffer = new Uint8Array(fontData); + } + } finally { + this._isOpening = false; } } /** */ async close() { + if (this._db === null) { + throw new Error('Database is not open'); + } + await this._termContentStore.endImportSession(); + await this._termRecordStore.endImportSession(); + if (this._bulkImportTransactionOpen) { + try { + this._db.exec('ROLLBACK'); + } catch (_) { /* NOP */ } + this._bulkImportTransactionOpen = false; + } + this._clearCachedStatements(); + this._termEntryContentCache.clear(); + this._termEntryContentIdByHash.clear(); + this._termExactPresenceCache.clear(); + this._termPrefixNegativeCache.clear(); + this._invalidateMaxHeadwordLengthCache(); + this._directTermIndexByDictionary.clear(); + this._clearTermsVtabCursorState(); + this._termsVtabModuleRegistered = false; this._db.close(); + this._db = null; + this._usesFallbackStorage = false; } /** * @returns {boolean} */ isPrepared() { - return this._db.isOpen(); + return this._db !== null; + } + + /** + * @returns {boolean} + */ + isOpening() { + return this._isOpening; + } + + /** + * @param {boolean} suspended + * @returns {Promise} + */ + async setSuspended(suspended) { + if (suspended) { + if (this._db !== null) { + await this.close(); + } + return; + } + if (this._db === null) { + await this.prepare(); + } + } + + /** + * @returns 
{boolean} + */ + usesFallbackStorage() { + return this._usesFallbackStorage; + } + + /** + * @returns {{mode: string, forceFallback: boolean, hasOpfsDbCtor: boolean, hasOpfsImportDb: boolean, hasWasmfsDir: boolean, attempts?: Array<{strategy: string, target: string, flags: string, error: string}>, lastError?: string|null}|null} + */ + getOpenStorageDiagnostics() { + if (this._openStorageDiagnostics === null) { + return null; + } + return {...this._openStorageDiagnostics}; + } + + /** + * @returns {Record|null} + */ + getStartupCleanupIncompleteImportsSummary() { + return this._startupCleanupIncompleteImportsSummary; + } + + /** + * @returns {Record|null} + */ + getStartupCleanupMissingTermRecordShardsSummary() { + return this._startupCleanupMissingTermRecordShardsSummary; + } + + /** */ + async startBulkImport() { + const db = this._requireDb(); + if (this._bulkImportDepth === 0) { + await this._termContentStore.beginImportSession(); + await this._termRecordStore.beginImportSession(); + this._applyImportPragmas(); + this._deferTermsVirtualTableSync = true; + this._termsVirtualTableDirty = false; + this._termEntryContentHasExistingRows = this._asNumber(db.selectValue('SELECT 1 FROM termEntryContent LIMIT 1'), 0) === 1; + for (const dropIndexSql of this._createDropIndexesSql()) { + db.exec(dropIndexSql); + } + this._termEntryContentIdByKey.clear(); + this._termEntryContentIdByHash.clear(); + this._clearTermEntryContentMetaCaches(); + this._termEntryContentCache.clear(); + this._termExactPresenceCache.clear(); + this._termPrefixNegativeCache.clear(); + this._directTermIndexByDictionary.clear(); + await this._beginImmediateTransaction(db); + this._bulkImportTransactionOpen = true; + } + ++this._bulkImportDepth; + } + + /** + * @param {((index: number, count: number) => void)?} [onCheckpoint] + * @returns {Promise<{commitMs: number, termContentEndImportSessionMs: number, termContentEndImportSessionFlushPendingWritesMs: number, 
termContentEndImportSessionAwaitQueuedWritesMs: number, termContentEndImportSessionCloseWritableMs: number, termContentDrainCycleCount: number, termContentWriteCallCount: number, termContentSingleChunkWriteCount: number, termContentMergedWriteCount: number, termContentTotalWriteBytes: number, termContentMergedWriteBytes: number, termContentMaxWriteBytes: number, termContentMergedGroupChunkCount: number, termContentMaxMergedGroupChunkCount: number, termContentFlushDueToBytesCount: number, termContentFlushDueToChunkCount: number, termContentFlushFinalGroupCount: number, termContentWriteCoalesceTargetBytes: number, termContentWriteCoalesceMaxChunks: number, termRecordEndImportSessionMs: number, termsVirtualTableSyncMs: number, createIndexesMs: number, createIndexesCheckpointCount: number, cacheResetMs: number, runtimePragmasMs: number, totalMs: number}|null>} + */ + async finishBulkImport(onCheckpoint = null) { + if (this._bulkImportDepth <= 0) { + return null; + } + --this._bulkImportDepth; + if (this._bulkImportDepth === 0) { + const db = this._requireDb(); + const tFinishBulkImportStart = safePerformance.now(); + let commitMs = 0; + let termContentEndImportSessionMs = 0; + let termContentEndImportSessionFlushPendingWritesMs = 0; + let termContentEndImportSessionAwaitQueuedWritesMs = 0; + let termContentEndImportSessionCloseWritableMs = 0; + let termContentDrainCycleCount = 0; + let termContentWriteCallCount = 0; + let termContentSingleChunkWriteCount = 0; + let termContentMergedWriteCount = 0; + let termContentTotalWriteBytes = 0; + let termContentMergedWriteBytes = 0; + let termContentMaxWriteBytes = 0; + let termContentMergedGroupChunkCount = 0; + let termContentMaxMergedGroupChunkCount = 0; + let termContentFlushDueToBytesCount = 0; + let termContentFlushDueToChunkCount = 0; + let termContentFlushFinalGroupCount = 0; + let termContentWriteCoalesceTargetBytes = 0; + let termContentWriteCoalesceMaxChunks = 0; + let termRecordEndImportSessionMs = 0; + let 
termsVirtualTableSyncMs = 0; + let createIndexesMs = 0; + let createIndexesCheckpointCount = 0; + let cacheResetMs = 0; + let runtimePragmasMs = 0; + try { + if (this._bulkImportTransactionOpen) { + const tCommitStart = safePerformance.now(); + db.exec('COMMIT'); + commitMs = safePerformance.now() - tCommitStart; + this._bulkImportTransactionOpen = false; + } + const tTermContentEndImportSessionStart = safePerformance.now(); + const termContentEndImportSessionPromise = this._termContentStore.endImportSession() + .then(() => { + termContentEndImportSessionMs = safePerformance.now() - tTermContentEndImportSessionStart; + const metrics = this._termContentStore.getLastEndImportSessionMetrics(); + if (metrics !== null) { + termContentEndImportSessionFlushPendingWritesMs = metrics.flushPendingWritesMs; + termContentEndImportSessionAwaitQueuedWritesMs = metrics.awaitQueuedWritesMs; + termContentEndImportSessionCloseWritableMs = metrics.closeWritableMs; + termContentDrainCycleCount = metrics.drainCycleCount; + termContentWriteCallCount = metrics.writeCallCount; + termContentSingleChunkWriteCount = metrics.singleChunkWriteCount; + termContentMergedWriteCount = metrics.mergedWriteCount; + termContentTotalWriteBytes = metrics.totalWriteBytes; + termContentMergedWriteBytes = metrics.mergedWriteBytes; + termContentMaxWriteBytes = metrics.maxWriteBytes; + termContentMergedGroupChunkCount = metrics.mergedGroupChunkCount; + termContentMaxMergedGroupChunkCount = metrics.maxMergedGroupChunkCount; + termContentFlushDueToBytesCount = metrics.flushDueToBytesCount; + termContentFlushDueToChunkCount = metrics.flushDueToChunkCount; + termContentFlushFinalGroupCount = metrics.flushFinalGroupCount; + termContentWriteCoalesceTargetBytes = metrics.writeCoalesceTargetBytes; + termContentWriteCoalesceMaxChunks = metrics.writeCoalesceMaxChunks; + } + }); + const tTermRecordEndImportSessionStart = safePerformance.now(); + const termRecordEndImportSessionPromise = 
this._termRecordStore.endImportSession() + .then(() => { + termRecordEndImportSessionMs = safePerformance.now() - tTermRecordEndImportSessionStart; + }); + await Promise.all([termContentEndImportSessionPromise, termRecordEndImportSessionPromise]); + if (this._termsVirtualTableDirty) { + await this._beginImmediateTransaction(db); + try { + const tTermsVirtualTableSyncStart = safePerformance.now(); + await this._syncTermsVirtualTableFromRecordStore(); + termsVirtualTableSyncMs = safePerformance.now() - tTermsVirtualTableSyncStart; + db.exec('COMMIT'); + this._termsVirtualTableDirty = false; + } catch (e) { + try { db.exec('ROLLBACK'); } catch (_) { /* NOP */ } + throw e; + } + } + const createIndexStatements = this._createIndexesSql(); + const tCreateIndexesStart = safePerformance.now(); + for (let i = 0; i < createIndexStatements.length; ++i) { + db.exec(createIndexStatements[i]); + if (typeof onCheckpoint === 'function') { + onCheckpoint(i + 1, createIndexStatements.length); + } + } + createIndexesMs = safePerformance.now() - tCreateIndexesStart; + createIndexesCheckpointCount = createIndexStatements.length; + const tCacheResetStart = safePerformance.now(); + this._termEntryContentIdByKey.clear(); + this._termEntryContentIdByHash.clear(); + this._clearTermEntryContentMetaCaches(); + this._termEntryContentCache.clear(); + this._termExactPresenceCache.clear(); + this._termPrefixNegativeCache.clear(); + this._directTermIndexByDictionary.clear(); + cacheResetMs = safePerformance.now() - tCacheResetStart; + this._deferTermsVirtualTableSync = false; + const tRuntimePragmasStart = safePerformance.now(); + this._applyRuntimePragmas(); + runtimePragmasMs = safePerformance.now() - tRuntimePragmasStart; + const totalMs = safePerformance.now() - tFinishBulkImportStart; + if (this._importDebugLogging) { + log.log( + '[yomitan-db-import] finishBulkImport ' + + `total=${totalMs.toFixed(1)}ms ` + + `commit=${commitMs.toFixed(1)}ms ` + + 
`termContentEnd=${termContentEndImportSessionMs.toFixed(1)}ms ` + + `termContentFlush=${termContentEndImportSessionFlushPendingWritesMs.toFixed(1)}ms ` + + `termContentAwait=${termContentEndImportSessionAwaitQueuedWritesMs.toFixed(1)}ms ` + + `termContentClose=${termContentEndImportSessionCloseWritableMs.toFixed(1)}ms ` + + `termContentDrainCycles=${termContentDrainCycleCount} ` + + `termContentWrites=${termContentWriteCallCount} ` + + `termContentSingleWrites=${termContentSingleChunkWriteCount} ` + + `termContentMergedWrites=${termContentMergedWriteCount} ` + + `termContentTotalWriteBytes=${termContentTotalWriteBytes} ` + + `termContentMergedWriteBytes=${termContentMergedWriteBytes} ` + + `termContentMaxWriteBytes=${termContentMaxWriteBytes} ` + + `termContentMergedGroupChunks=${termContentMergedGroupChunkCount} ` + + `termContentMaxMergedGroupChunks=${termContentMaxMergedGroupChunkCount} ` + + `termContentFlushDueToBytes=${termContentFlushDueToBytesCount} ` + + `termContentFlushDueToChunks=${termContentFlushDueToChunkCount} ` + + `termContentFlushFinalGroups=${termContentFlushFinalGroupCount} ` + + `termContentWriteCoalesceTargetBytes=${termContentWriteCoalesceTargetBytes} ` + + `termContentWriteCoalesceMaxChunks=${termContentWriteCoalesceMaxChunks} ` + + `termRecordEnd=${termRecordEndImportSessionMs.toFixed(1)}ms ` + + `termsVtabSync=${termsVirtualTableSyncMs.toFixed(1)}ms ` + + `createIndexes=${createIndexesMs.toFixed(1)}ms ` + + `cacheReset=${cacheResetMs.toFixed(1)}ms ` + + `runtimePragmas=${runtimePragmasMs.toFixed(1)}ms ` + + `indexStatements=${createIndexesCheckpointCount}`, + ); + } + return { + commitMs, + termContentEndImportSessionMs, + termContentEndImportSessionFlushPendingWritesMs, + termContentEndImportSessionAwaitQueuedWritesMs, + termContentEndImportSessionCloseWritableMs, + termContentDrainCycleCount, + termContentWriteCallCount, + termContentSingleChunkWriteCount, + termContentMergedWriteCount, + termContentTotalWriteBytes, + 
termContentMergedWriteBytes, + termContentMaxWriteBytes, + termContentMergedGroupChunkCount, + termContentMaxMergedGroupChunkCount, + termContentFlushDueToBytesCount, + termContentFlushDueToChunkCount, + termContentFlushFinalGroupCount, + termContentWriteCoalesceTargetBytes, + termContentWriteCoalesceMaxChunks, + termRecordEndImportSessionMs, + termsVirtualTableSyncMs, + createIndexesMs, + createIndexesCheckpointCount, + cacheResetMs, + runtimePragmasMs, + totalMs, + }; + } finally { + if (this._bulkImportTransactionOpen) { + try { db.exec('ROLLBACK'); } catch (_) { /* NOP */ } + this._bulkImportTransactionOpen = false; + } + await this._termContentStore.endImportSession(); + await this._termRecordStore.endImportSession(); + } + } + } + + /** + * @param {boolean} value + */ + setTermEntryContentDedupEnabled(value) { + this._enableTermEntryContentDedup = value; + } + + /** + * @param {boolean} value + */ + setImportDebugLogging(value) { + this._importDebugLogging = value; + } + + /** + * @param {{termContentStorageMode?: 'baseline'|'raw-bytes'}} [options] + */ + setImportOptimizationFlags(options = {}) { + this._adaptiveTermBulkAddBatchSize = true; + this._retryBeginImmediateTransaction = false; + this._skipIntraBatchContentDedup = false; + this._termBulkAddStagingMaxRows = this._computeDefaultTermBulkAddStagingMaxRows(); + this._termRecordRowAppendFastPath = true; + this._termContentStorageMode = (options.termContentStorageMode === TERM_CONTENT_STORAGE_MODE_RAW_BYTES) ? 
+ options.termContentStorageMode : + TERM_CONTENT_STORAGE_MODE_BASELINE; + this._termContentCompressionMinBytes = 1048576; + this._rawTermContentPackTargetBytes = DEFAULT_RAW_TERM_CONTENT_PACK_TARGET_BYTES; + this._termContentStore.setImportStorageMode(this._termContentStorageMode); + this._termContentStore.setWriteCoalesceMaxChunksOverride(null); } /** * @returns {Promise} */ async purge() { - if (this._db.isOpening()) { + if (this._isOpening) { throw new Error('Cannot purge database while opening'); } - if (this._db.isOpen()) { + + this._invalidateMaxHeadwordLengthCache(); + + if (this._db !== null) { + if (this._bulkImportTransactionOpen) { + try { + this._db.exec('ROLLBACK'); + } catch (_) { /* NOP */ } + this._bulkImportTransactionOpen = false; + } + this._applyRuntimePragmas(); + this._clearCachedStatements(); this._db.close(); + this._db = null; + this._usesFallbackStorage = false; } + await this._termContentStore.reset(); + await this._termRecordStore.reset(); + if (this._worker !== null) { this._worker.terminate(); this._worker = null; } + let result = false; try { - await Database.deleteDatabase(this._dbName); - result = true; + result = await deleteOpfsDatabaseFiles(); } catch (e) { log.error(e); } + await this.prepare(); + this._termEntryContentCache.clear(); + this._termEntryContentIdByHash.clear(); + this._clearTermEntryContentMetaCaches(); + this._termExactPresenceCache.clear(); + this._termPrefixNegativeCache.clear(); + this._directTermIndexByDictionary.clear(); return result; } + /** + * @returns {Promise} + */ + async exportDatabase() { + const db = this._requireDb(); + const sqlite3 = this._requireSqlite3(); + const pageCount = this._asNumber(db.selectValue('PRAGMA page_count'), -1); + const pageSize = this._asNumber(db.selectValue('PRAGMA page_size'), -1); + const freelistCount = this._asNumber(db.selectValue('PRAGMA freelist_count'), -1); + const approxDbBytes = (pageCount > 0 && pageSize > 0) ? 
pageCount * pageSize : -1; + + // Release cached prepared statements before serialization to reduce wasm heap pressure. + this._clearCachedStatements(); + try { + db.exec('PRAGMA shrink_memory'); + } catch (_) { + // Not all sqlite builds expose shrink_memory; ignore when unavailable. + } + try { + db.exec('PRAGMA wal_checkpoint(TRUNCATE)'); + } catch (_) { + // In-memory/non-WAL databases may reject checkpoint pragmas. + } + try { + const snapshotDb = await this._createExportSnapshotDatabase(); + let exported; + try { + exported = this._exportDatabaseImage(snapshotDb, sqlite3); + } finally { + try { + snapshotDb.close(); + } catch (_) { + // Ignore snapshot close failures after export. + } + } + if (exported === null || exported.byteLength === 0) { + throw new Error('Database export returned an empty payload'); + } + return exported; + } catch (e) { + const storageDiagnostics = this.getOpenStorageDiagnostics(); + const wrappedError = new Error( + `Database serialization failed: ${String(e && typeof e === 'object' && 'message' in e ? 
Reflect.get(e, 'message') : e)} ` + + `(usesFallbackStorage=${String(this._usesFallbackStorage)} ` + + `pageCount=${String(pageCount)} pageSize=${String(pageSize)} freelistCount=${String(freelistCount)} ` + + `approxDbBytes=${String(approxDbBytes)} storageDiagnostics=${JSON.stringify(storageDiagnostics)})`, + ); + log.warn(wrappedError); + throw wrappedError; + } + } + + /** + * @returns {Promise} + */ + async _createExportSnapshotDatabase() { + const sourceDb = this._requireDb(); + const sqlite3 = this._requireSqlite3(); + const snapshotDb = new sqlite3.oo1.DB(':memory:', 'ct'); + snapshotDb.exec(` + PRAGMA journal_mode = DELETE; + PRAGMA synchronous = NORMAL; + PRAGMA user_version = ${CURRENT_DICTIONARY_SCHEMA_VERSION}; + + CREATE TABLE dictionaries ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + title TEXT NOT NULL, + version INTEGER NOT NULL, + summaryJson TEXT NOT NULL + ); + + CREATE TABLE terms ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + dictionary TEXT NOT NULL, + expression TEXT NOT NULL, + reading TEXT NOT NULL, + expressionReverse TEXT, + readingReverse TEXT, + definitionTags TEXT, + termTags TEXT, + rules TEXT, + score INTEGER, + glossaryJson TEXT NOT NULL, + sequence INTEGER + ); + + CREATE TABLE termMeta ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + dictionary TEXT NOT NULL, + expression TEXT NOT NULL, + mode TEXT NOT NULL, + dataJson TEXT NOT NULL + ); + + CREATE TABLE kanji ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + dictionary TEXT NOT NULL, + character TEXT NOT NULL, + onyomi TEXT, + kunyomi TEXT, + tags TEXT, + meaningsJson TEXT NOT NULL, + statsJson TEXT + ); + + CREATE TABLE kanjiMeta ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + dictionary TEXT NOT NULL, + character TEXT NOT NULL, + mode TEXT NOT NULL, + dataJson TEXT NOT NULL + ); + + CREATE TABLE tagMeta ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + dictionary TEXT NOT NULL, + name TEXT NOT NULL, + category TEXT, + ord INTEGER, + notes TEXT, + score INTEGER + ); + + CREATE TABLE media ( + id INTEGER 
PRIMARY KEY AUTOINCREMENT, + dictionary TEXT NOT NULL, + path TEXT NOT NULL, + mediaType TEXT NOT NULL, + width INTEGER NOT NULL, + height INTEGER NOT NULL, + content BLOB NOT NULL + ); + `); + + snapshotDb.exec('BEGIN'); + try { + await this._copyRowsToDatabase( + sourceDb, + snapshotDb, + 'SELECT title, version, summaryJson FROM dictionaries ORDER BY id ASC', + 'INSERT INTO dictionaries(title, version, summaryJson) VALUES($title, $version, $summaryJson)', + (row) => ({ + $title: this._asString(row.title), + $version: this._asNumber(row.version, 0), + $summaryJson: this._asString(row.summaryJson), + }), + ); + await this._copyTermRowsToDatabase(snapshotDb); + await this._copyRowsToDatabase( + sourceDb, + snapshotDb, + 'SELECT dictionary, expression, mode, dataJson FROM termMeta ORDER BY id ASC', + 'INSERT INTO termMeta(dictionary, expression, mode, dataJson) VALUES($dictionary, $expression, $mode, $dataJson)', + (row) => ({ + $dictionary: this._asString(row.dictionary), + $expression: this._asString(row.expression), + $mode: this._asString(row.mode), + $dataJson: this._asString(row.dataJson), + }), + ); + await this._copyRowsToDatabase( + sourceDb, + snapshotDb, + 'SELECT dictionary, character, onyomi, kunyomi, tags, meaningsJson, statsJson FROM kanji ORDER BY id ASC', + 'INSERT INTO kanji(dictionary, character, onyomi, kunyomi, tags, meaningsJson, statsJson) VALUES($dictionary, $character, $onyomi, $kunyomi, $tags, $meaningsJson, $statsJson)', + (row) => ({ + $dictionary: this._asString(row.dictionary), + $character: this._asString(row.character), + $onyomi: this._asNullableString(row.onyomi), + $kunyomi: this._asNullableString(row.kunyomi), + $tags: this._asNullableString(row.tags), + $meaningsJson: this._asString(row.meaningsJson), + $statsJson: this._asNullableString(row.statsJson), + }), + ); + await this._copyRowsToDatabase( + sourceDb, + snapshotDb, + 'SELECT dictionary, character, mode, dataJson FROM kanjiMeta ORDER BY id ASC', + 'INSERT INTO 
kanjiMeta(dictionary, character, mode, dataJson) VALUES($dictionary, $character, $mode, $dataJson)', + (row) => ({ + $dictionary: this._asString(row.dictionary), + $character: this._asString(row.character), + $mode: this._asString(row.mode), + $dataJson: this._asString(row.dataJson), + }), + ); + await this._copyRowsToDatabase( + sourceDb, + snapshotDb, + 'SELECT dictionary, name, category, ord, notes, score FROM tagMeta ORDER BY id ASC', + 'INSERT INTO tagMeta(dictionary, name, category, ord, notes, score) VALUES($dictionary, $name, $category, $ord, $notes, $score)', + (row) => ({ + $dictionary: this._asString(row.dictionary), + $name: this._asString(row.name), + $category: this._asNullableString(row.category), + $ord: this._asNullableNumber(row.ord), + $notes: this._asNullableString(row.notes), + $score: this._asNullableNumber(row.score), + }), + ); + await this._copyRowsToDatabase( + sourceDb, + snapshotDb, + 'SELECT dictionary, path, mediaType, width, height, content FROM media ORDER BY id ASC', + 'INSERT INTO media(dictionary, path, mediaType, width, height, content) VALUES($dictionary, $path, $mediaType, $width, $height, $content)', + (row) => ({ + $dictionary: this._asString(row.dictionary), + $path: this._asString(row.path), + $mediaType: this._asString(row.mediaType), + $width: this._asNumber(row.width, 0), + $height: this._asNumber(row.height, 0), + $content: this._toUint8Array(row.content) ?? new Uint8Array(), + }), + ); + snapshotDb.exec('COMMIT'); + } catch (error) { + try { + snapshotDb.exec('ROLLBACK'); + } catch (_) { + // Ignore rollback errors after export snapshot failures. + } + try { + snapshotDb.close(); + } catch (_) { + // Ignore close errors while surfacing the original snapshot failure. 
+ } + throw error; + } + + return snapshotDb; + } + + /** + * @param {import('@sqlite.org/sqlite-wasm').Database} sourceDb + * @param {import('@sqlite.org/sqlite-wasm').Database} targetDb + * @param {string} sourceSql + * @param {string} targetSql + * @param {(row: import('core').SafeAny) => Record} createBind + * @returns {Promise} + */ + async _copyRowsToDatabase(sourceDb, targetDb, sourceSql, targetSql, createBind) { + const sourceStmt = /** @type {import('@sqlite.org/sqlite-wasm').PreparedStatement} */ (sourceDb.prepare(sourceSql)); + const targetStmt = /** @type {import('@sqlite.org/sqlite-wasm').PreparedStatement} */ (targetDb.prepare(targetSql)); + try { + sourceStmt.reset(true); + while (sourceStmt.step()) { + const row = /** @type {import('core').SafeAny} */ (sourceStmt.get({})); + targetStmt.reset(true); + targetStmt.bind(createBind(row)); + targetStmt.step(); + } + } finally { + try { + sourceStmt.finalize(); + } catch (_) { + // Ignore finalization errors for export snapshot reads. + } + try { + targetStmt.finalize(); + } catch (_) { + // Ignore finalization errors for export snapshot writes. + } + } + } + + /** + * Rebuilds legacy inline term rows from the external term record/content stores + * so exports do not depend on the runtime virtual table module being available. 
+ * @param {import('@sqlite.org/sqlite-wasm').Database} targetDb + * @returns {Promise} + */ + async _copyTermRowsToDatabase(targetDb) { + await this._termContentStore.ensureLoadedForRead(); + const targetStmt = /** @type {import('@sqlite.org/sqlite-wasm').PreparedStatement} */ (targetDb.prepare( + 'INSERT INTO terms(dictionary, expression, reading, expressionReverse, readingReverse, definitionTags, termTags, rules, score, glossaryJson, sequence) VALUES($dictionary, $expression, $reading, $expressionReverse, $readingReverse, $definitionTags, $termTags, $rules, $score, $glossaryJson, $sequence)', + )); + try { + for (const id of this._termRecordStore.getAllIds()) { + const record = this._termRecordStore.getById(id); + if (typeof record === 'undefined') { continue; } + const row = await this._deserializeTermRow({ + id, + dictionary: record.dictionary, + expression: record.expression, + reading: record.reading, + expressionReverse: record.expressionReverse, + readingReverse: record.readingReverse, + entryContentId: null, + entryContentOffset: record.entryContentOffset, + entryContentLength: record.entryContentLength, + entryContentDictName: record.entryContentDictName, + definitionTags: '', + termTags: '', + rules: '', + score: record.score, + glossaryJson: '[]', + sequence: record.sequence, + }); + targetStmt.reset(true); + targetStmt.bind({ + $dictionary: row.dictionary, + $expression: row.expression, + $reading: row.reading, + $expressionReverse: row.expressionReverse, + $readingReverse: row.readingReverse, + $definitionTags: row.definitionTags, + $termTags: typeof row.termTags === 'string' ? row.termTags : null, + $rules: row.rules, + $score: row.score, + $glossaryJson: JSON.stringify(row.glossary), + $sequence: row.sequence, + }); + targetStmt.step(); + } + } finally { + try { + targetStmt.finalize(); + } catch (_) { + // Ignore finalization errors for export snapshot writes. 
+ } + } + } + + /** + * @param {import('@sqlite.org/sqlite-wasm').Database} db + * @param {import('@sqlite.org/sqlite-wasm').Sqlite3Static} sqlite3 + * @returns {ArrayBuffer|null} + * @throws {Error} + */ + _exportDatabaseImage(db, sqlite3) { + /** @type {ArrayBuffer|null} */ + let exported = null; + const exportBinaryImage = /** @type {unknown} */ (Reflect.get(db, 'exportBinaryImage')); + if (typeof exportBinaryImage === 'function') { + try { + const raw = /** @type {() => Uint8Array|ArrayBuffer} */ (exportBinaryImage).call(db); + const bytes = raw instanceof Uint8Array ? raw : new Uint8Array(raw); + if (bytes.byteLength > 0) { + exported = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength); + } + } catch (_) { + // Fall back to sqlite3_js_db_export below. + } + } + if (exported !== null) { + return exported; + } + + const dbPointer = db.pointer; + if (typeof dbPointer !== 'number') { + throw new Error('sqlite database pointer is unavailable'); + } + const raw = sqlite3.capi.sqlite3_js_db_export(dbPointer); + const bytes = raw instanceof Uint8Array ? 
raw : new Uint8Array(raw); + if (bytes.byteLength > 0) { + return bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength); + } + return null; + } + + /** + * @param {ArrayBuffer} content + */ + async importDatabase(content) { + const sqlite3 = await getSqlite3(); + + if (this._db !== null) { + this._clearCachedStatements(); + this._db.close(); + this._db = null; + this._usesFallbackStorage = false; + } + + await this._termContentStore.prepare(); + await this._termRecordStore.prepare(); + await this._termContentStore.reset(); + await this._termRecordStore.reset(); + + await importOpfsDatabase(content); + + this._sqlite3 = sqlite3; + await this._openConnection(); + this._invalidateMaxHeadwordLengthCache(); + this._termEntryContentCache.clear(); + this._termEntryContentIdByHash.clear(); + this._clearTermEntryContentMetaCaches(); + this._termExactPresenceCache.clear(); + this._termPrefixNegativeCache.clear(); + this._directTermIndexByDictionary.clear(); + } + /** * @param {string} dictionaryName * @param {number} progressRate * @param {import('dictionary-database').DeleteDictionaryProgressCallback} onProgress */ async deleteDictionary(dictionaryName, progressRate, onProgress) { - /** @type {[objectStoreName: import('dictionary-database').ObjectStoreName, key: string][][]} */ - const targetGroups = [ - [ - ['kanji', 'dictionary'], - ['kanjiMeta', 'dictionary'], - ['terms', 'dictionary'], - ['termMeta', 'dictionary'], - ['tagMeta', 'dictionary'], - ['media', 'dictionary'], - ], - [ - ['dictionaries', 'title'], - ], - ]; + const db = this._requireDb(); - let storeCount = 0; - for (const targets of targetGroups) { - storeCount += targets.length; - } + /** @type {[table: string, keyColumn: string][]} */ + const targets = [ + ['kanji', 'dictionary'], + ['kanjiMeta', 'dictionary'], + ['termMeta', 'dictionary'], + ['tagMeta', 'dictionary'], + ['media', 'dictionary'], + ['dictionaries', 'title'], + ]; /** @type 
{import('dictionary-database').DeleteDictionaryProgressData} */ const progressData = { count: 0, processed: 0, - storeCount, + storeCount: targets.length + 1, storesProcesed: 0, }; - /** - * @param {IDBValidKey[]} keys - * @returns {IDBValidKey[]} - */ - const filterKeys = (keys) => { + /** @type {number[]} */ + const counts = []; + const termCount = this._termRecordStore.getDictionaryIndex(dictionaryName).expression.size > 0 ? + [...this._termRecordStore.getDictionaryIndex(dictionaryName).expression.values()].reduce((sum, list) => sum + list.length, 0) : + 0; + progressData.count += termCount; + counts.push(termCount); + for (const [table, keyColumn] of targets) { + const count = this._asNumber(db.selectValue(`SELECT COUNT(*) FROM ${table} WHERE ${keyColumn} = $value`, {$value: dictionaryName}), 0); + counts.push(count); + progressData.count += count; ++progressData.storesProcesed; - progressData.count += keys.length; onProgress(progressData); - return keys; - }; - const onProgressWrapper = () => { - const processed = progressData.processed + 1; - progressData.processed = processed; - if ((processed % progressRate) === 0 || processed === progressData.count) { - onProgress(progressData); - } - }; + } + + progressData.storesProcesed = 0; - for (const targets of targetGroups) { - const promises = []; - for (const [objectStoreName, indexName] of targets) { - const query = IDBKeyRange.only(dictionaryName); - const promise = this._db.bulkDelete(objectStoreName, indexName, query, filterKeys, onProgressWrapper); - promises.push(promise); + await this._beginImmediateTransaction(db); + try { + let countIndex = 1; + const deletedTerms = await this._termRecordStore.deleteByDictionary(dictionaryName); + this._termsVirtualTableDirty = true; + progressData.processed += deletedTerms; + ++progressData.storesProcesed; + onProgress(progressData); + for (let i = 0; i < targets.length; ++i) { + const [table, keyColumn] = targets[i]; + db.exec({sql: `DELETE FROM ${table} WHERE 
${keyColumn} = $value`, bind: {$value: dictionaryName}}); + progressData.processed += counts[countIndex++]; + ++progressData.storesProcesed; + if ((progressData.processed % progressRate) === 0 || progressData.processed >= progressData.count) { + onProgress(progressData); + } } - await Promise.all(promises); + db.exec('COMMIT'); + } catch (e) { + try { db.exec('ROLLBACK'); } catch (_) { /* NOP */ } + throw e; } + + onProgress(progressData); + this._invalidateMaxHeadwordLengthCache(); + this._termEntryContentCache.clear(); + this._termEntryContentIdByHash.clear(); + this._clearTermEntryContentMetaCaches(); + this._termExactPresenceCache.clear(); + this._termPrefixNegativeCache.clear(); + this._directTermIndexByDictionary.clear(); } /** - * @param {string[]} termList - * @param {import('dictionary-database').DictionarySet} dictionaries - * @param {import('dictionary-database').MatchType} matchType - * @returns {Promise} + * @param {string} sql + * @returns {import('@sqlite.org/sqlite-wasm').PreparedStatement} */ - findTermsBulk(termList, dictionaries, matchType) { - const visited = new Set(); - /** @type {import('dictionary-database').FindPredicate} */ - const predicate = (row) => { - if (!dictionaries.has(row.dictionary)) { return false; } - const {id} = row; - if (visited.has(id)) { return false; } - visited.add(id); - return true; - }; + _getCachedStatement(sql) { + const cached = this._statementCache.get(sql); + if (typeof cached !== 'undefined') { + this._statementCache.delete(sql); + this._statementCache.set(sql, cached); + return cached; + } + const db = this._requireDb(); + const created = /** @type {import('@sqlite.org/sqlite-wasm').PreparedStatement} */ (db.prepare(sql)); + while (this._statementCache.size >= this._statementCacheMaxEntries) { + const first = this._statementCache.entries().next(); + if (first.done) { + break; + } + this._statementCache.delete(first.value[0]); + try { + first.value[1].finalize(); + } catch (_) { + // NOP + } + } + 
this._statementCache.set(sql, created); + return created; + } - const indexNames = (matchType === 'suffix') ? ['expressionReverse', 'readingReverse'] : ['expression', 'reading']; + /** */ + _clearCachedStatements() { + for (const stmt of this._statementCache.values()) { + try { + stmt.finalize(); + } catch (_) { + // NOP + } + } + this._statementCache.clear(); + this._termEntryContentCache.clear(); + this._termEntryContentIdByHash.clear(); + this._clearTermEntryContentMetaCaches(); + this._termExactPresenceCache.clear(); + this._termPrefixNegativeCache.clear(); + this._directTermIndexByDictionary.clear(); + } - let createQuery = this._createOnlyQuery1; - switch (matchType) { - case 'prefix': - createQuery = this._createBoundQuery1; - break; - case 'suffix': - createQuery = this._createBoundQuery2; - break; + /** */ + _invalidateMaxHeadwordLengthCache() { + this._maxHeadwordLengthCache = null; + } + + /** + * @param {Iterable} values + * @param {string} prefix + * @returns {{clause: string, bind: Record}} + */ + _buildTextInClause(values, prefix) { + /** @type {string[]} */ + const placeholders = []; + /** @type {Record} */ + const bind = {}; + let index = 0; + for (const value of values) { + const key = `${prefix}${index++}`; + placeholders.push(`$${key}`); + bind[`$${key}`] = value; } + return { + clause: placeholders.length > 0 ? placeholders.join(', ') : "''", + bind, + }; + } - const createResult = this._createTermGeneric.bind(this, matchType); + /** + * @param {Iterable} values + * @param {string} prefix + * @returns {{clause: string, bind: Record}} + */ + _buildNumberInClause(values, prefix) { + /** @type {string[]} */ + const placeholders = []; + /** @type {Record} */ + const bind = {}; + let index = 0; + for (const value of values) { + const key = `${prefix}${index++}`; + placeholders.push(`$${key}`); + bind[`$${key}`] = value; + } + return { + clause: placeholders.length > 0 ? 
placeholders.join(', ') : '-1', + bind, + }; + } - return this._findMultiBulk('terms', indexNames, termList, createQuery, predicate, createResult); + /** + * @template T + * @param {T[]} values + * @param {number} chunkSize + * @returns {T[][]} + */ + _chunkValues(values, chunkSize) { + /** @type {T[][]} */ + const chunks = []; + if (chunkSize <= 0) { + return chunks; + } + for (let i = 0; i < values.length; i += chunkSize) { + chunks.push(values.slice(i, i + chunkSize)); + } + return chunks; + } + + /** + * @param {string[]} dictionaryNames + * @returns {string} + */ + _getDictionaryCacheKey(dictionaryNames) { + if (dictionaryNames.length <= 1) { + return dictionaryNames[0] ?? ''; + } + return [...dictionaryNames].sort().join('\u001f'); + } + + /** + * @param {string} dictionaryCacheKey + * @param {string} term + * @returns {string} + */ + _createTermExactPresenceCacheKey(dictionaryCacheKey, term) { + return `${dictionaryCacheKey}\u001f${term}`; + } + + /** + * @param {string} dictionaryName + * @returns {{expression: Map, reading: Map, expressionReverse: Map, readingReverse: Map, pair: Map, sequence: Map}} + */ + _ensureDirectTermIndex(dictionaryName) { + const existing = this._directTermIndexByDictionary.get(dictionaryName); + if (typeof existing !== 'undefined') { + return existing; + } + const index = this._termRecordStore.getDictionaryIndex(dictionaryName); + this._directTermIndexByDictionary.set(dictionaryName, index); + return index; + } + + /** + * @param {import('dictionary-database').DictionarySet} dictionaries + * @returns {string[]} + */ + _getDictionaryNames(dictionaries) { + if (dictionaries instanceof Map) { + return [...dictionaries.keys()]; + } + return [...dictionaries]; + } + + /** + * @param {string[]} terms + * @returns {Map} + */ + _buildTermIndexMap(terms) { + /** @type {Map} */ + const result = new Map(); + for (let i = 0; i < terms.length; ++i) { + const term = terms[i]; + const list = result.get(term); + if (typeof list === 'undefined') { 
+ result.set(term, [i]); + } else { + list.push(i); + } + } + return result; + } + + /** + * @param {string[]} termList + * @param {import('dictionary-database').DictionarySet} dictionaries + * @param {import('dictionary-database').MatchType} matchType + * @returns {Promise} + */ + async findTermsBulk(termList, dictionaries, matchType) { + this._requireDb(); + if (termList.length === 0 || dictionaries.size === 0) { + return []; + } + const visited = new Set(); + /** @type {import('dictionary-database').TermEntry[]} */ + const results = []; + const dictionaryNames = this._getDictionaryNames(dictionaries); + + /** @type {('expression'|'reading'|'expressionReverse'|'readingReverse')[]} */ + const columns = (matchType === 'suffix') ? ['expressionReverse', 'readingReverse'] : ['expression', 'reading']; + + if (matchType === 'exact') { + /** @type {Map} */ + const termIndexMap = new Map(); + /** @type {Map} */ + const idMatches = new Map(); + const dictionaryCacheKey = this._getDictionaryCacheKey(dictionaryNames); + for (let i = 0; i < termList.length; ++i) { + const term = termList[i]; + const termPresenceKey = this._createTermExactPresenceCacheKey(dictionaryCacheKey, term); + const cachedPresence = this._termExactPresenceCache.get(termPresenceKey); + if (cachedPresence === false) { + continue; + } + const existingList = termIndexMap.get(term); + if (typeof existingList === 'undefined') { + termIndexMap.set(term, [i]); + } else { + existingList.push(i); + } + } + if (termIndexMap.size === 0) { + return []; + } + for (const term of termIndexMap.keys()) { + const itemIndexes = /** @type {number[]} */ (termIndexMap.get(term)); + let found = false; + for (const dictionaryName of dictionaryNames) { + const index = this._ensureDirectTermIndex(dictionaryName); + const expressionIds = index.expression.get(term); + if (typeof expressionIds !== 'undefined') { + found = true; + for (const id of expressionIds) { + if (id <= 0 || visited.has(id)) { continue; } + visited.add(id); + 
const matches = idMatches.get(id); + if (typeof matches === 'undefined') { + idMatches.set(id, itemIndexes.map((itemIndex) => ({matchSource: 'term', itemIndex}))); + } else { + for (const itemIndex of itemIndexes) { + matches.push({matchSource: 'term', itemIndex}); + } + } + } + } + } + for (const dictionaryName of dictionaryNames) { + const index = this._ensureDirectTermIndex(dictionaryName); + const readingIds = index.reading.get(term); + if (typeof readingIds !== 'undefined') { + found = true; + for (const id of readingIds) { + if (id <= 0 || visited.has(id)) { continue; } + visited.add(id); + const matches = idMatches.get(id); + if (typeof matches === 'undefined') { + idMatches.set(id, itemIndexes.map((itemIndex) => ({matchSource: 'reading', itemIndex}))); + } else { + for (const itemIndex of itemIndexes) { + matches.push({matchSource: 'reading', itemIndex}); + } + } + } + } + } + const termPresenceKey = this._createTermExactPresenceCacheKey(dictionaryCacheKey, term); + this._setTermExactPresenceCached(termPresenceKey, found); + } + + if (idMatches.size === 0) { + return []; + } + + const rowsById = await this._fetchTermRowsByIds(idMatches.keys()); + for (const [id, matches] of idMatches) { + const row = rowsById.get(id); + if (typeof row === 'undefined') { continue; } + for (const {matchSource, itemIndex} of matches) { + results.push(this._createTerm(matchSource, 'exact', row, itemIndex)); + } + } + return results; + } + + /** @type {Map} */ + const idMatches = new Map(); + /** @type {Map} */ + const uniqueQueryMap = new Map(); + for (let itemIndex = 0; itemIndex < termList.length; ++itemIndex) { + const term = termList[itemIndex]; + const query = matchType === 'suffix' ? 
stringReverse(term) : term; + if (query.length === 0) { continue; } + if (!uniqueQueryMap.has(query)) { + uniqueQueryMap.set(query, {term, query, itemIndex}); + } + } + const dictionaryCacheKey = this._getDictionaryCacheKey(dictionaryNames); + const negativeCachePrefix = `${matchType}\u001f${dictionaryCacheKey}\u001f`; + const queriesToCheck = [...uniqueQueryMap.values()].filter(({query}) => !this._termPrefixNegativeCache.has(`${negativeCachePrefix}${query}`)); + /** @type {Set} */ + const foundQueries = new Set(); + + for (let indexIndex = 0; indexIndex < columns.length; ++indexIndex) { + const column = columns[indexIndex]; + /** @type {Map|null} */ + let lookup = null; + for (const queryData of queriesToCheck) { + for (const dictionaryName of dictionaryNames) { + const index = this._ensureDirectTermIndex(dictionaryName); + switch (column) { + case 'expression': + lookup = index.expression; + break; + case 'reading': + lookup = index.reading; + break; + case 'expressionReverse': + lookup = index.expressionReverse; + break; + case 'readingReverse': + lookup = index.readingReverse; + break; + default: + lookup = null; + break; + } + if (lookup === null) { continue; } + for (const [value, ids] of lookup.entries()) { + if (!value.startsWith(queryData.query)) { continue; } + foundQueries.add(queryData.query); + for (const id of ids) { + if (id <= 0 || visited.has(id)) { continue; } + visited.add(id); + const matchSource = (indexIndex === 0) ? 'term' : 'reading'; + const matchType2 = (value === queryData.term) ? 
'exact' : matchType; + idMatches.set(id, {matchSource, matchType: matchType2, itemIndex: queryData.itemIndex}); + } + } + } + } + } + for (const {query} of queriesToCheck) { + const key = `${negativeCachePrefix}${query}`; + if (foundQueries.has(query)) { + this._termPrefixNegativeCache.delete(key); + } else { + this._termPrefixNegativeCache.set(key, true); + } + } + if (this._termPrefixNegativeCache.size > 50000) { + this._termPrefixNegativeCache.clear(); + } + + const rowsById = await this._fetchTermRowsByIds(idMatches.keys()); + for (const [id, {matchSource, matchType: matchType2, itemIndex}] of idMatches) { + const row = rowsById.get(id); + if (typeof row === 'undefined') { continue; } + results.push(this._createTerm(matchSource, matchType2, row, itemIndex)); + } + return results; + } + + /** + * @param {import('dictionary-database').TermExactRequest[]} termList + * @param {import('dictionary-database').DictionarySet} dictionaries + * @returns {Promise} + */ + async findTermsExactBulk(termList, dictionaries) { + this._requireDb(); + if (termList.length === 0 || dictionaries.size === 0) { + return []; + } + /** @type {import('dictionary-database').TermEntry[]} */ + const results = []; + const dictionaryNames = this._getDictionaryNames(dictionaries); + /** @type {Map} */ + const termReadingIndexes = new Map(); + for (let itemIndex = 0; itemIndex < termList.length; ++itemIndex) { + const item = termList[itemIndex]; + const key = `${item.term}\u001f${item.reading}`; + const itemIndexes = termReadingIndexes.get(key); + if (typeof itemIndexes === 'undefined') { + termReadingIndexes.set(key, [itemIndex]); + } else { + itemIndexes.push(itemIndex); + } + } + const uniquePairs = [...termReadingIndexes.keys()]; + /** @type {Map} */ + const idMatches = new Map(); + for (const pair of uniquePairs) { + const itemIndexes = termReadingIndexes.get(pair); + if (typeof itemIndexes === 'undefined') { continue; } + for (const dictionaryName of dictionaryNames) { + const index = 
this._ensureDirectTermIndex(dictionaryName); + const ids = index.pair.get(pair); + if (typeof ids === 'undefined') { continue; } + for (const id of ids) { + if (id <= 0) { continue; } + const existingIndexes = idMatches.get(id); + if (typeof existingIndexes === 'undefined') { + idMatches.set(id, [...itemIndexes]); + } else { + for (const itemIndex of itemIndexes) { + existingIndexes.push(itemIndex); + } + } + } + } + } + + const rowsById = await this._fetchTermRowsByIds(idMatches.keys()); + for (const [id, itemIndexes] of idMatches) { + const row = rowsById.get(id); + if (typeof row === 'undefined') { continue; } + for (const itemIndex of itemIndexes) { + results.push(this._createTerm('term', 'exact', row, itemIndex)); + } + } + + return results; + } + + /** + * @param {import('dictionary-database').DictionaryAndQueryRequest[]} items + * @returns {Promise} + */ + async findTermsBySequenceBulk(items) { + this._requireDb(); + if (items.length === 0) { + return []; + } + /** @type {import('dictionary-database').TermEntry[]} */ + const results = []; + /** @type {Map} */ + const dictionarySequenceIndexes = new Map(); + for (let itemIndex = 0; itemIndex < items.length; ++itemIndex) { + const item = items[itemIndex]; + const sequence = this._asNumber(item.query, -1); + if (sequence < 0) { continue; } + const key = `${item.dictionary}\u001f${sequence}`; + const itemIndexes = dictionarySequenceIndexes.get(key); + if (typeof itemIndexes === 'undefined') { + dictionarySequenceIndexes.set(key, [itemIndex]); + } else { + itemIndexes.push(itemIndex); + } + } + if (dictionarySequenceIndexes.size === 0) { + return []; + } + const dictionaryNames = [...new Set(items.map((item) => item.dictionary))]; + const sequenceValues = [...new Set(items.map((item) => this._asNumber(item.query, -1)).filter((value) => value >= 0))]; + /** @type {Map} */ + const idMatches = new Map(); + for (const dictionaryName of dictionaryNames) { + const index = this._ensureDirectTermIndex(dictionaryName); + 
for (const sequence of sequenceValues) { + const ids = index.sequence.get(sequence); + if (typeof ids === 'undefined') { continue; } + const key = `${dictionaryName}\u001f${sequence}`; + const itemIndexes = dictionarySequenceIndexes.get(key); + if (typeof itemIndexes === 'undefined') { continue; } + for (const id of ids) { + if (id <= 0) { continue; } + const existingIndexes = idMatches.get(id); + if (typeof existingIndexes === 'undefined') { + idMatches.set(id, [...itemIndexes]); + } else { + for (const itemIndex of itemIndexes) { + existingIndexes.push(itemIndex); + } + } + } + } + } + + const rowsById = await this._fetchTermRowsByIds(idMatches.keys()); + for (const [id, itemIndexes] of idMatches) { + const row = rowsById.get(id); + if (typeof row === 'undefined') { continue; } + for (const itemIndex of itemIndexes) { + results.push(this._createTerm('sequence', 'exact', row, itemIndex)); + } + } + + return results; + } + + /** + * @param {string[]} termList + * @param {import('dictionary-database').DictionarySet} dictionaries + * @returns {Promise} + */ + async findTermMetaBulk(termList, dictionaries) { + if (termList.length === 0 || dictionaries.size === 0) { + return []; + } + /** @type {import('dictionary-database').TermMeta[]} */ + const results = []; + const termIndexMap = this._buildTermIndexMap(termList); + const dictionaryNames = this._getDictionaryNames(dictionaries); + const {clause: termInClause, bind: termBind} = this._buildTextInClause(termIndexMap.keys(), 'term'); + const {clause: dictionaryInClause, bind: dictionaryBind} = this._buildTextInClause(dictionaryNames, 'dict'); + const sql = `SELECT * FROM termMeta WHERE expression IN (${termInClause}) AND dictionary IN (${dictionaryInClause})`; + const stmt = this._getCachedStatement(sql); + stmt.reset(true); + stmt.bind({...termBind, ...dictionaryBind}); + while (stmt.step()) { + const row = /** @type {import('core').SafeAny} */ (stmt.get({})); + const expression = this._asString(row.expression); + 
const itemIndexes = termIndexMap.get(expression); + if (typeof itemIndexes === 'undefined') { continue; } + const converted = this._deserializeTermMetaRow(row); + for (const itemIndex of itemIndexes) { + results.push(this._createTermMeta(converted, {itemIndex, indexIndex: 0, item: expression})); + } + } + + return results; + } + + /** + * @param {string[]} kanjiList + * @param {import('dictionary-database').DictionarySet} dictionaries + * @returns {Promise} + */ + async findKanjiBulk(kanjiList, dictionaries) { + if (kanjiList.length === 0 || dictionaries.size === 0) { + return []; + } + /** @type {import('dictionary-database').KanjiEntry[]} */ + const results = []; + const characterIndexMap = this._buildTermIndexMap(kanjiList); + const dictionaryNames = this._getDictionaryNames(dictionaries); + const {clause: characterInClause, bind: characterBind} = this._buildTextInClause(characterIndexMap.keys(), 'ch'); + const {clause: dictionaryInClause, bind: dictionaryBind} = this._buildTextInClause(dictionaryNames, 'dict'); + const sql = `SELECT * FROM kanji WHERE character IN (${characterInClause}) AND dictionary IN (${dictionaryInClause})`; + const stmt = this._getCachedStatement(sql); + stmt.reset(true); + stmt.bind({...characterBind, ...dictionaryBind}); + while (stmt.step()) { + const converted = this._deserializeKanjiRow(/** @type {import('core').SafeAny} */ (stmt.get({}))); + const itemIndexes = characterIndexMap.get(converted.character); + if (typeof itemIndexes === 'undefined') { continue; } + for (const itemIndex of itemIndexes) { + results.push(this._createKanji(converted, {itemIndex, indexIndex: 0, item: converted.character})); + } + } + + return results; + } + + /** + * @param {string[]} kanjiList + * @param {import('dictionary-database').DictionarySet} dictionaries + * @returns {Promise} + */ + async findKanjiMetaBulk(kanjiList, dictionaries) { + if (kanjiList.length === 0 || dictionaries.size === 0) { + return []; + } + /** @type 
{import('dictionary-database').KanjiMeta[]} */ + const results = []; + const characterIndexMap = this._buildTermIndexMap(kanjiList); + const dictionaryNames = this._getDictionaryNames(dictionaries); + const {clause: characterInClause, bind: characterBind} = this._buildTextInClause(characterIndexMap.keys(), 'ch'); + const {clause: dictionaryInClause, bind: dictionaryBind} = this._buildTextInClause(dictionaryNames, 'dict'); + const sql = `SELECT * FROM kanjiMeta WHERE character IN (${characterInClause}) AND dictionary IN (${dictionaryInClause})`; + const stmt = this._getCachedStatement(sql); + stmt.reset(true); + stmt.bind({...characterBind, ...dictionaryBind}); + while (stmt.step()) { + const row = /** @type {import('core').SafeAny} */ (stmt.get({})); + const character = this._asString(row.character); + const itemIndexes = characterIndexMap.get(character); + if (typeof itemIndexes === 'undefined') { continue; } + const converted = this._deserializeKanjiMetaRow(row); + for (const itemIndex of itemIndexes) { + results.push(this._createKanjiMeta(converted, {itemIndex, indexIndex: 0, item: character})); + } + } + + return results; + } + + /** + * @param {import('dictionary-database').DictionaryAndQueryRequest[]} items + * @returns {Promise<(import('dictionary-database').Tag|undefined)[]>} + */ + async findTagMetaBulk(items) { + if (items.length === 0) { + return []; + } + const results = new Array(items.length); + /** @type {Map} */ + const requestIndexes = new Map(); + for (let i = 0; i < items.length; ++i) { + const item = items[i]; + const key = `${item.dictionary}\u001f${this._asString(item.query)}`; + const itemIndexes = requestIndexes.get(key); + if (typeof itemIndexes === 'undefined') { + requestIndexes.set(key, [i]); + } else { + itemIndexes.push(i); + } + } + + const uniqueRequests = [...requestIndexes.keys()]; + for (const requestChunk of this._chunkValues(uniqueRequests, 256)) { + /** @type {Record} */ + const bind = {}; + const conditions = []; + for (let i 
= 0; i < requestChunk.length; ++i) { + const [dictionary, query] = requestChunk[i].split('\u001f'); + const dictionaryKey = `$dictionary${i}`; + const queryKey = `$query${i}`; + bind[dictionaryKey] = dictionary; + bind[queryKey] = query; + conditions.push(`(dictionary = ${dictionaryKey} AND name = ${queryKey})`); + } + const sql = `SELECT name, category, ord as "order", notes, score, dictionary FROM tagMeta WHERE ${conditions.join(' OR ')}`; + const stmt = this._getCachedStatement(sql); + stmt.reset(true); + stmt.bind(bind); + while (stmt.step()) { + const row = /** @type {import('core').SafeAny} */ (stmt.get({})); + const tag = this._deserializeTagRow(row); + const itemIndexes = requestIndexes.get(`${tag.dictionary}\u001f${tag.name}`); + if (typeof itemIndexes === 'undefined') { continue; } + for (const itemIndex of itemIndexes) { + if (typeof results[itemIndex] === 'undefined') { + results[itemIndex] = tag; + } + } + } + } + + return results; + } + + /** + * @param {string} name + * @param {string} dictionary + * @returns {Promise} + */ + async findTagForTitle(name, dictionary) { + const db = this._requireDb(); + const row = db.selectObject( + 'SELECT name, category, ord as "order", notes, score, dictionary FROM tagMeta WHERE name = $name AND dictionary = $dictionary LIMIT 1', + {$name: name, $dictionary: dictionary}, + ); + return typeof row === 'undefined' ? 
null : this._deserializeTagRow(row); + } + + /** + * @param {import('dictionary-database').MediaRequest[]} items + * @returns {Promise} + */ + async getMedia(items) { + if (items.length === 0) { + return []; + } + /** @type {import('dictionary-database').Media[]} */ + const results = []; + /** @type {Map} */ + const mediaRequestIndexes = new Map(); + for (let itemIndex = 0; itemIndex < items.length; ++itemIndex) { + const item = items[itemIndex]; + const key = `${item.dictionary}\u001f${item.path}`; + const itemIndexes = mediaRequestIndexes.get(key); + if (typeof itemIndexes === 'undefined') { + mediaRequestIndexes.set(key, [itemIndex]); + } else { + itemIndexes.push(itemIndex); + } + } + const uniqueRequests = [...mediaRequestIndexes.keys()]; + for (const requestChunk of this._chunkValues(uniqueRequests, 128)) { + /** @type {Record} */ + const bind = {}; + const conditions = []; + for (let i = 0; i < requestChunk.length; ++i) { + const [dictionary, path] = requestChunk[i].split('\u001f'); + const dictionaryKey = `$dictionary${i}`; + const pathKey = `$path${i}`; + bind[dictionaryKey] = dictionary; + bind[pathKey] = path; + conditions.push(`(dictionary = ${dictionaryKey} AND path = ${pathKey})`); + } + const sql = `SELECT dictionary, path, mediaType, width, height, content FROM media WHERE ${conditions.join(' OR ')}`; + const stmt = this._getCachedStatement(sql); + stmt.reset(true); + stmt.bind(bind); + while (stmt.step()) { + const row = /** @type {import('core').SafeAny} */ (stmt.get({})); + const converted = this._deserializeMediaRow(row); + const itemIndexes = mediaRequestIndexes.get(`${converted.dictionary}\u001f${converted.path}`); + if (typeof itemIndexes === 'undefined') { continue; } + for (const itemIndex of itemIndexes) { + results.push(this._createMedia(converted, {itemIndex, indexIndex: 0, item: items[itemIndex]})); + } + } + } + + return results; + } + + /** + * @param {import('dictionary-database').DrawMediaRequest[]} items + * @param {MessagePort} 
source + */ + async drawMedia(items, source) { + if (this._worker !== null) { + this._worker.postMessage({action: 'drawMedia', params: {items}}, [source]); + return; + } + + safePerformance.mark('drawMedia:start'); + + /** @type {Map} */ + const groupedItems = new Map(); + for (const item of items) { + const {path, dictionary, canvasIndex, canvasWidth, canvasHeight, generation} = item; + const key = `${path}:::${dictionary}`; + if (!groupedItems.has(key)) { + groupedItems.set(key, {path, dictionary, canvasIndexes: [], canvasWidth, canvasHeight, generation}); + } + groupedItems.get(key)?.canvasIndexes.push(canvasIndex); + } + const groupedItemsArray = [...groupedItems.values()]; + const media = await this.getMedia(groupedItemsArray); + const results = media.map((item) => { + const grouped = groupedItemsArray[item.index]; + return { + ...item, + canvasIndexes: grouped.canvasIndexes, + canvasWidth: grouped.canvasWidth, + canvasHeight: grouped.canvasHeight, + generation: grouped.generation, + }; + }); + + results.sort((a, _b) => (a.mediaType === 'image/svg+xml' ? -1 : 1)); + + safePerformance.mark('drawMedia:draw:start'); + for (const m of results) { + if (m.mediaType === 'image/svg+xml') { + safePerformance.mark('drawMedia:draw:svg:start'); + /** @type {import('@resvg/resvg-wasm').ResvgRenderOptions} */ + const opts = { + fitTo: { + mode: 'width', + value: m.canvasWidth, + }, + font: { + fontBuffers: this._resvgFontBuffer !== null ? 
[this._resvgFontBuffer] : [], + }, + }; + const resvgJS = new Resvg(new Uint8Array(m.content), opts); + const render = resvgJS.render(); + source.postMessage({action: 'drawBufferToCanvases', params: {buffer: render.pixels.buffer, width: render.width, height: render.height, canvasIndexes: m.canvasIndexes, generation: m.generation}}, [render.pixels.buffer]); + safePerformance.mark('drawMedia:draw:svg:end'); + safePerformance.measure('drawMedia:draw:svg', 'drawMedia:draw:svg:start', 'drawMedia:draw:svg:end'); + } else { + safePerformance.mark('drawMedia:draw:raster:start'); + + if ('serviceWorker' in navigator) { + const imageDecoder = new ImageDecoder({type: m.mediaType, data: m.content}); + await imageDecoder.decode().then((decodedImageResult) => { + source.postMessage({action: 'drawDecodedImageToCanvases', params: {decodedImage: decodedImageResult.image, canvasIndexes: m.canvasIndexes, generation: m.generation}}, [decodedImageResult.image]); + }); + } else { + const image = new Blob([m.content], {type: m.mediaType}); + await createImageBitmap(image, {resizeWidth: m.canvasWidth, resizeHeight: m.canvasHeight, resizeQuality: 'high'}).then((decodedImage) => { + const canvas = new OffscreenCanvas(decodedImage.width, decodedImage.height); + const ctx = canvas.getContext('2d'); + if (ctx !== null) { + ctx.drawImage(decodedImage, 0, 0); + const imageData = ctx.getImageData(0, 0, decodedImage.width, decodedImage.height); + source.postMessage({action: 'drawBufferToCanvases', params: {buffer: imageData.data.buffer, width: decodedImage.width, height: decodedImage.height, canvasIndexes: m.canvasIndexes, generation: m.generation}}, [imageData.data.buffer]); + } + }); + } + safePerformance.mark('drawMedia:draw:raster:end'); + safePerformance.measure('drawMedia:draw:raster', 'drawMedia:draw:raster:start', 'drawMedia:draw:raster:end'); + } + } + safePerformance.mark('drawMedia:draw:end'); + safePerformance.measure('drawMedia:draw', 'drawMedia:draw:start', 'drawMedia:draw:end'); + + 
safePerformance.mark('drawMedia:end'); + safePerformance.measure('drawMedia', 'drawMedia:start', 'drawMedia:end'); + } + + /** + * @returns {Promise} + */ + async getDictionaryInfo() { + const db = this._requireDb(); + const rows = db.selectObjects('SELECT summaryJson FROM dictionaries ORDER BY id ASC'); + return rows.map((row) => /** @type {import('dictionary-importer').Summary} */ (this._safeParseJson(this._asString(row.summaryJson), {}))); + } + + /** + * @returns {Promise} + */ + async getMaxHeadwordLength() { + const cached = this._maxHeadwordLengthCache; + if (typeof cached === 'number') { + return cached; + } + + const db = this._requireDb(); + const value = db.selectValue(` + SELECT MAX( + CASE + WHEN LENGTH(COALESCE(reading, '')) > LENGTH(COALESCE(expression, '')) THEN LENGTH(COALESCE(reading, '')) + ELSE LENGTH(COALESCE(expression, '')) + END + ) + FROM terms + `); + const maxHeadwordLength = Math.max(0, this._asNumber(value, 0)); + this._maxHeadwordLengthCache = maxHeadwordLength; + return maxHeadwordLength; + } + + /** + * @returns {Promise<{ + * scannedCount: number, + * removedCount: number, + * removedTitles: string[], + * removedEmptyTitleRows: number, + * failedCount: number, + * failedTitles: string[], + * parseErrorCount: number + * }>} + */ + async _cleanupIncompleteImports() { + const db = this._requireDb(); + const rows = db.selectObjects('SELECT id, title, summaryJson FROM dictionaries ORDER BY id ASC'); + if (rows.length === 0) { + const summary = { + scannedCount: 0, + removedCount: 0, + removedTitles: [], + removedEmptyTitleRows: 0, + failedCount: 0, + failedTitles: [], + parseErrorCount: 0, + }; + this._startupCleanupIncompleteImportsSummary = summary; + reportDiagnostics('dictionary-startup-cleanup-summary', summary); + return summary; + } + + /** @type {Set} */ + const dictionaryTitlesToDelete = new Set(); + /** @type {number} */ + let removedEmptyTitleRows = 0; + /** @type {number} */ + let parseErrorCount = 0; + for (const row of 
rows) { + const id = this._asNumber(row.id, 0); + const title = this._asString(row.title).trim(); + const summaryJson = this._asString(row.summaryJson); + let summaryParseFailed = false; + /** @type {unknown} */ + let summary; + try { + summary = /** @type {unknown} */ (parseJson(summaryJson)); + } catch (_) { + summary = null; + summaryParseFailed = true; + } + if (summaryParseFailed) { + parseErrorCount += 1; + } + const importSuccess = ( + typeof summary === 'object' && + summary !== null && + !Array.isArray(summary) + ) ? + /** @type {unknown} */ (Reflect.get(summary, 'importSuccess')) : + void 0; + if (summary !== null && importSuccess !== false) { + continue; + } + if (title.length === 0) { + db.exec({sql: 'DELETE FROM dictionaries WHERE id = $id', bind: {$id: id}}); + log.warn('Removed incomplete dictionary summary row with empty title.'); + removedEmptyTitleRows += 1; + continue; + } + dictionaryTitlesToDelete.add(title); + } + + /** @type {string[]} */ + const removedTitles = []; + /** @type {string[]} */ + const failedTitles = []; + for (const dictionaryTitle of dictionaryTitlesToDelete) { + try { + await this.deleteDictionary(dictionaryTitle, 1000, () => {}); + log.warn(`Removed incomplete dictionary import during startup: ${dictionaryTitle}`); + removedTitles.push(dictionaryTitle); + } catch (e) { + const error = toError(e); + log.error(new Error(`Failed to remove incomplete dictionary import '${dictionaryTitle}': ${error.message}`)); + failedTitles.push(dictionaryTitle); + } + } + + const summary = { + scannedCount: rows.length, + removedCount: removedTitles.length + removedEmptyTitleRows, + removedTitles: [...removedTitles].sort((a, b) => a.localeCompare(b)), + removedEmptyTitleRows, + failedCount: failedTitles.length, + failedTitles: [...failedTitles].sort((a, b) => a.localeCompare(b)), + parseErrorCount, + }; + this._startupCleanupIncompleteImportsSummary = summary; + reportDiagnostics('dictionary-startup-cleanup-summary', summary); + return 
summary; + } + + /** + * @returns {Promise<{ + * scannedCount: number, + * expectedTermDictionaryCount: number, + * missingShardDictionaryCount: number, + * missingShardDictionaryNames: string[], + * removedCount: number, + * removedTitles: string[], + * failedCount: number, + * failedTitles: string[], + * parseErrorCount: number, + * shardIntegrity: { + * expectedShardCount: number, + * actualShardCount: number, + * missingShardCount: number, + * missingShardFileNames: string[], + * missingDictionaryNames: string[], + * orphanShardCount: number, + * orphanShardFileNames: string[], + * orphanDictionaryNames: string[], + * removedOrphanShardCount: number, + * invalidShardPayloadCount: number, + * invalidShardFileNames: string[], + * rewroteAllShardsFromMemory: boolean + * } + * }>} + */ + async _cleanupMissingTermRecordShards() { + const db = this._requireDb(); + const rows = db.selectObjects('SELECT title, summaryJson FROM dictionaries ORDER BY id ASC'); + /** @type {string[]} */ + const expectedTermDictionaryNames = []; + let parseErrorCount = 0; + for (const row of rows) { + const title = this._asString(row.title).trim(); + if (title.length === 0) { continue; } + let summary; + try { + summary = /** @type {unknown} */ (parseJson(this._asString(row.summaryJson))); + } catch (_) { + ++parseErrorCount; + continue; + } + if (typeof summary !== 'object' || summary === null || Array.isArray(summary)) { + continue; + } + const counts = /** @type {unknown} */ (Reflect.get(summary, 'counts')); + const terms = (typeof counts === 'object' && counts !== null) ? /** @type {unknown} */ (Reflect.get(counts, 'terms')) : null; + const total = (typeof terms === 'object' && terms !== null) ? 
this._asNumber(Reflect.get(terms, 'total'), 0) : 0; + if (total > 0) { + expectedTermDictionaryNames.push(title); + } + } + const shardIntegrity = await this._termRecordStore.verifyIntegrity(expectedTermDictionaryNames); + const missingShardDictionaryNames = [...new Set( + (Array.isArray(shardIntegrity.missingDictionaryNames) ? shardIntegrity.missingDictionaryNames : []) + .filter((name) => typeof name === 'string' && name.length > 0), + )].sort((a, b) => a.localeCompare(b)); + + /** @type {string[]} */ + const removedTitles = []; + /** @type {string[]} */ + const failedTitles = []; + for (const title of missingShardDictionaryNames) { + try { + await this.deleteDictionary(title, 1000, () => {}); + removedTitles.push(title); + } catch (e) { + const error = toError(e); + log.error(new Error(`Failed to remove dictionary with missing term-record shard '${title}': ${error.message}`)); + failedTitles.push(title); + } + } + + const summary = { + scannedCount: rows.length, + expectedTermDictionaryCount: expectedTermDictionaryNames.length, + missingShardDictionaryCount: missingShardDictionaryNames.length, + missingShardDictionaryNames, + removedCount: removedTitles.length, + removedTitles: [...removedTitles].sort((a, b) => a.localeCompare(b)), + failedCount: failedTitles.length, + failedTitles: [...failedTitles].sort((a, b) => a.localeCompare(b)), + parseErrorCount, + shardIntegrity, + }; + this._startupCleanupMissingTermRecordShardsSummary = summary; + reportDiagnostics('dictionary-term-record-integrity-summary', summary); + return summary; + } + + /** + * @param {string[]} dictionaryNames + * @param {boolean} getTotal + * @returns {Promise} + */ + async getDictionaryCounts(dictionaryNames, getTotal) { + const db = this._requireDb(); + const tables = ['kanji', 'kanjiMeta', 'termMeta', 'tagMeta', 'media']; + + /** @type {import('dictionary-database').DictionaryCountGroup[]} */ + const counts = []; + + if (getTotal) { + /** @type 
{import('dictionary-database').DictionaryCountGroup} */ + const total = {terms: this._termRecordStore.size}; + for (const table of tables) { + total[table] = this._asNumber(db.selectValue(`SELECT COUNT(*) FROM ${table}`), 0); + } + counts.push(total); + } + + for (const dictionaryName of dictionaryNames) { + /** @type {import('dictionary-database').DictionaryCountGroup} */ + const countGroup = {terms: 0}; + const termIndex = this._termRecordStore.getDictionaryIndex(dictionaryName); + countGroup.terms = [...termIndex.expression.values()].reduce((sum, list) => sum + list.length, 0); + for (const table of tables) { + countGroup[table] = this._asNumber( + db.selectValue(`SELECT COUNT(*) FROM ${table} WHERE dictionary = $dictionary`, {$dictionary: dictionaryName}), + 0, + ); + } + counts.push(countGroup); + } + + const total = getTotal ? /** @type {import('dictionary-database').DictionaryCountGroup} */ (counts.shift()) : null; + return {total, counts}; + } + + /** + * @param {string} title + * @returns {Promise} + */ + async dictionaryExists(title) { + const db = this._requireDb(); + const value = db.selectValue('SELECT 1 FROM dictionaries WHERE title = $title LIMIT 1', {$title: title}); + return typeof value !== 'undefined'; + } + + /** + * @template {import('dictionary-database').ObjectStoreName} T + * @param {T} objectStoreName + * @param {import('dictionary-database').ObjectStoreData[]} items + * @param {number} start + * @param {number} count + * @returns {Promise} + */ + async bulkAdd(objectStoreName, items, start, count) { + const db = this._requireDb(); + + if (start + count > items.length) { + count = items.length - start; + } + if (count <= 0) { return; } + if (objectStoreName === 'terms') { + this._invalidateMaxHeadwordLengthCache(); + this._lastBulkAddTermsMetrics = null; + this._termEntryContentCache.clear(); + if (!this._bulkImportTransactionOpen) { + this._termEntryContentIdByHash.clear(); + this._termEntryContentIdByKey.clear(); + 
this._clearTermEntryContentMetaCaches(); + } + this._termExactPresenceCache.clear(); + this._termPrefixNegativeCache.clear(); + this._directTermIndexByDictionary.clear(); + } + + if (objectStoreName === 'terms') { + await this._bulkAddTerms(/** @type {import('dictionary-database').ObjectStoreData<'terms'>[]} */ (items), start, count); + return; + } + const descriptor = this._getBulkInsertDescriptor(objectStoreName); + const useLocalTransaction = !this._bulkImportTransactionOpen; + + if (useLocalTransaction) { + await this._beginImmediateTransaction(db); + } + try { + await this._bulkInsertWithDescriptor(descriptor, items, start, count); + if (useLocalTransaction) { + db.exec('COMMIT'); + } + } catch (e) { + if (useLocalTransaction) { + try { db.exec('ROLLBACK'); } catch (_) { /* NOP */ } + } + throw e; + } + } + + /** + * @returns {{contentAppendMs: number, termRecordBuildMs: number, termRecordEncodeMs: number, termRecordWriteMs: number, termsVtabInsertMs: number}|null} + */ + getLastBulkAddTermsMetrics() { + return this._lastBulkAddTermsMetrics; + } + + /** + * @param {string} dictionary + * @returns {{contentOffset: number, contentLength: number, contentDictName: string, uncompressedLength: number}|null} + */ + _getSharedGlossaryArtifactMeta(dictionary) { + const cached = this._sharedGlossaryArtifactMetaByDictionary.get(dictionary); + if (typeof cached !== 'undefined') { + return cached; + } + const db = this._requireDb(); + const row = db.selectObject( + 'SELECT contentOffset, contentLength, contentDictName, uncompressedLength FROM sharedGlossaryArtifacts WHERE dictionary = $dictionary LIMIT 1', + {$dictionary: dictionary}, + ); + if (typeof row === 'undefined') { + return null; + } + const meta = { + contentOffset: this._asNumber(row.contentOffset, -1), + contentLength: this._asNumber(row.contentLength, 0), + contentDictName: this._asString(row.contentDictName), + uncompressedLength: this._asNumber(row.uncompressedLength, 0), + }; + 
this._sharedGlossaryArtifactMetaByDictionary.set(dictionary, meta); + return meta; + } + + /** + * @param {string} dictionary + * @param {number} glossaryOffset + * @param {number} glossaryLength + * @returns {Promise} + */ + async _readCompressedSharedGlossarySlice(dictionary, glossaryOffset, glossaryLength) { + const cached = this._sharedGlossaryArtifactInflatedByDictionary.get(dictionary); + if (cached instanceof Uint8Array) { + return cached.subarray(glossaryOffset, glossaryOffset + glossaryLength); + } + const meta = this._getSharedGlossaryArtifactMeta(dictionary); + if (meta === null || meta.contentOffset < 0 || meta.contentLength <= 0) { + return new Uint8Array(0); + } + const compressedBytes = await this._termContentStore.readSlice(meta.contentOffset, meta.contentLength); + let inflatedBytes = compressedBytes; + if (meta.contentDictName === RAW_TERM_CONTENT_COMPRESSED_SHARED_GLOSSARY_DICT_NAME) { + const defaultHeapSize = meta.uncompressedLength > 0 ? meta.uncompressedLength : (compressedBytes.byteLength * 16); + inflatedBytes = zstdDecompress(compressedBytes, {defaultHeapSize}); + } + this._sharedGlossaryArtifactInflatedByDictionary.set(dictionary, inflatedBytes); + return inflatedBytes.subarray(glossaryOffset, glossaryOffset + glossaryLength); + } + + /** + * @param {string} dictionary + * @param {Uint8Array} bytes + * @param {string} contentDictName + * @param {number} uncompressedLength + * @returns {Promise<{offset: number, length: number}>} + */ + async appendRawSharedGlossaryArtifact(dictionary, bytes, contentDictName, uncompressedLength) { + const spans = await this._termContentStore.appendBatch([bytes]); + const span = spans.length > 0 ? 
spans[0] : {offset: 0, length: 0}; + const db = this._requireDb(); + db.exec({ + sql: ` + INSERT INTO sharedGlossaryArtifacts(dictionary, contentOffset, contentLength, contentDictName, uncompressedLength) + VALUES($dictionary, $contentOffset, $contentLength, $contentDictName, $uncompressedLength) + ON CONFLICT(dictionary) DO UPDATE SET + contentOffset = excluded.contentOffset, + contentLength = excluded.contentLength, + contentDictName = excluded.contentDictName, + uncompressedLength = excluded.uncompressedLength + `, + bind: { + $dictionary: dictionary, + $contentOffset: span.offset, + $contentLength: span.length, + $contentDictName: contentDictName, + $uncompressedLength: Math.max(0, uncompressedLength), + }, + }); + this._sharedGlossaryArtifactMetaByDictionary.set(dictionary, { + contentOffset: span.offset, + contentLength: span.length, + contentDictName, + uncompressedLength: Math.max(0, uncompressedLength), + }); + this._sharedGlossaryArtifactInflatedByDictionary.delete(dictionary); + return span; + } + + /** + * @param {{table: string, columnsSql: string, rowPlaceholderSql: string, batchSize: number, bindRow: (item: unknown) => import('@sqlite.org/sqlite-wasm').Bindable[]}} descriptor + * @param {unknown[]} items + * @param {number} start + * @param {number} count + * @returns {Promise} + */ + async _bulkInsertWithDescriptor(descriptor, items, start, count) { + const {table, columnsSql, rowPlaceholderSql, batchSize, bindRow} = descriptor; + for (let i = start, ii = start + count; i < ii; i += batchSize) { + const chunkCount = Math.min(batchSize, ii - i); + /** @type {string[]} */ + const valueRows = []; + /** @type {import('@sqlite.org/sqlite-wasm').Bindable[]} */ + const bind = []; + for (let j = 0; j < chunkCount; ++j) { + valueRows.push(rowPlaceholderSql); + const rowBind = bindRow(items[i + j]); + for (const value of rowBind) { + bind.push(value); + } + } + const sql = `INSERT INTO ${table}(${columnsSql}) VALUES ${valueRows.join(',')}`; + const stmt = 
this._getCachedStatement(sql); + stmt.reset(true); + stmt.bind(bind); + stmt.step(); + } + } + + /** + * @param {import('dictionary-database').ObjectStoreName} objectStoreName + * @returns {{table: string, columnsSql: string, rowPlaceholderSql: string, batchSize: number, bindRow: (item: unknown) => import('@sqlite.org/sqlite-wasm').Bindable[]}} + * @throws {Error} + */ + _getBulkInsertDescriptor(objectStoreName) { + switch (objectStoreName) { + case 'dictionaries': + return { + table: 'dictionaries', + columnsSql: 'title, version, summaryJson', + rowPlaceholderSql: '(?, ?, ?)', + batchSize: 256, + bindRow: (item) => { + const summary = /** @type {import('dictionary-importer').Summary} */ (item); + return [summary.title, summary.version, JSON.stringify(summary)]; + }, + }; + case 'termMeta': + return { + table: 'termMeta', + columnsSql: 'dictionary, expression, mode, dataJson', + rowPlaceholderSql: '(?, ?, ?, ?)', + batchSize: 2048, + bindRow: (item) => { + const row = /** @type {import('dictionary-database').DatabaseTermMeta} */ (item); + return [row.dictionary, row.expression, row.mode, JSON.stringify(row.data)]; + }, + }; + case 'kanji': + return { + table: 'kanji', + columnsSql: 'dictionary, character, onyomi, kunyomi, tags, meaningsJson, statsJson', + rowPlaceholderSql: '(?, ?, ?, ?, ?, ?, ?)', + batchSize: 1024, + bindRow: (item) => { + const row = /** @type {import('dictionary-database').DatabaseKanjiEntry} */ (item); + return [ + row.dictionary, + row.character, + row.onyomi, + row.kunyomi, + row.tags, + JSON.stringify(row.meanings), + typeof row.stats !== 'undefined' ? 
JSON.stringify(row.stats) : null, + ]; + }, + }; + case 'kanjiMeta': + return { + table: 'kanjiMeta', + columnsSql: 'dictionary, character, mode, dataJson', + rowPlaceholderSql: '(?, ?, ?, ?)', + batchSize: 2048, + bindRow: (item) => { + const row = /** @type {import('dictionary-database').DatabaseKanjiMeta} */ (item); + return [row.dictionary, row.character, row.mode, JSON.stringify(row.data)]; + }, + }; + case 'tagMeta': + return { + table: 'tagMeta', + columnsSql: 'dictionary, name, category, ord, notes, score', + rowPlaceholderSql: '(?, ?, ?, ?, ?, ?)', + batchSize: 2048, + bindRow: (item) => { + const row = /** @type {import('dictionary-database').Tag} */ (item); + return [row.dictionary, row.name, row.category, row.order, row.notes, row.score]; + }, + }; + case 'media': + return { + table: 'media', + columnsSql: 'dictionary, path, mediaType, width, height, content', + rowPlaceholderSql: '(?, ?, ?, ?, ?, ?)', + batchSize: 8, + bindRow: (item) => { + const row = /** @type {import('dictionary-database').MediaDataArrayBufferContent} */ (item); + return [row.dictionary, row.path, row.mediaType, row.width, row.height, row.content]; + }, + }; + default: + throw new Error(`Unsupported object store: ${objectStoreName}`); + } + } + + /** + * @template {import('dictionary-database').ObjectStoreName} T + * @param {T} objectStoreName + * @param {import('dictionary-database').ObjectStoreData} item + * @returns {Promise} + */ + async addWithResult(objectStoreName, item) { + await this.bulkAdd(objectStoreName, [item], 0, 1); + const db = this._requireDb(); + return this._asNumber(db.selectValue('SELECT last_insert_rowid()'), -1); + } + + /** + * @template {import('dictionary-database').ObjectStoreName} T + * @param {T} objectStoreName + * @param {import('dictionary-database').DatabaseUpdateItem[]} items + * @param {number} start + * @param {number} count + * @returns {Promise} + */ + async bulkUpdate(objectStoreName, items, start, count) { + const db = this._requireDb(); + + 
if (start + count > items.length) { + count = items.length - start; + } + if (count <= 0) { return; } + + switch (objectStoreName) { + case 'dictionaries': + break; + default: + throw new Error(`Unsupported bulkUpdate store: ${objectStoreName}`); + } + + const stmt = this._getCachedStatement('UPDATE dictionaries SET title = $title, version = $version, summaryJson = $summaryJson WHERE id = $id'); + const useLocalTransaction = !this._bulkImportTransactionOpen; + + if (useLocalTransaction) { + await this._beginImmediateTransaction(db); + } + try { + for (let i = start, ii = start + count; i < ii; ++i) { + const {data, primaryKey} = items[i]; + const summary = /** @type {import('dictionary-importer').Summary} */ (data); + stmt.reset(true); + stmt.bind({ + $id: this._asNumber(primaryKey, -1), + $title: summary.title, + $version: summary.version, + $summaryJson: JSON.stringify(summary), + }); + stmt.step(); + } + if (useLocalTransaction) { + db.exec('COMMIT'); + } + } catch (e) { + if (useLocalTransaction) { + try { db.exec('ROLLBACK'); } catch (_) { /* NOP */ } + } + throw e; + } + } + + /** + * @param {import('dictionary-database').ObjectStoreData<'terms'>[]} items + * @param {number} start + * @param {number} count + * @returns {Promise} + */ + async _bulkAddTerms(items, start, count) { + if (!this._enableTermEntryContentDedup) { + await this._bulkAddTermsWithoutContentDedup(items, start, count); + return; + } + if (!this._termContentZstdInitialized) { + await initializeTermContentZstd(); + this._termContentZstdInitialized = true; + } + const db = this._requireDb(); + const useLocalTransaction = !this._bulkImportTransactionOpen; + const tBulkStart = safePerformance.now(); + let lastProgressLog = tBulkStart; + let computeContentMs = 0; + let compressContentMs = 0; + let appendContentMs = 0; + let insertContentSqlMs = 0; + let insertTermsSqlMs = 0; + let insertTermRecordAppendMs = 0; + let insertTermsVtabMs = 0; + let commitMs = 0; + let appendedContentBytes = 0; + let 
resolvedFromCacheCount = 0; + /** @type {number[]} */ + const contentBatchDurationsMs = []; + /** @type {number[]} */ + const termBatchDurationsMs = []; + /** @type {number[]} */ + const termBatchRowsPerSecond = []; + let failFastConsecutiveLowThroughputWindows = 0; + const termBatchSize = this._getTermBulkAddBatchSizeForCount(count); + const stagingBatchSize = Math.max(512, Math.min(this._termBulkAddStagingMaxRows, termBatchSize)); + const contentBatchSize = 8192; + const shouldDedupWithinBatch = !this._skipIntraBatchContentDedup; + let processedRowCount = 0; + let insertedRowCount = 0; + let totalPendingContentUniqueCount = 0; + const compressionDictName = count > 0 ? resolveTermContentZstdDictName((/** @type {import('dictionary-database').DatabaseTermEntry} */ (items[start])).dictionary) : null; + + /** @type {import('dictionary-database').DatabaseTermEntry[]} */ + let stagedRows = []; + /** @type {number[]} */ + let stagedPendingContentIndexes = []; + /** @type {number[]} */ + let stagedContentOffsets = []; + /** @type {number[]} */ + let stagedContentLengths = []; + /** @type {(string|null)[]} */ + let stagedContentDictNames = []; + /** @type {(string|null)[]} */ + let pendingContentHashes = []; + /** @type {number[]} */ + let pendingContentHash1s = []; + /** @type {number[]} */ + let pendingContentHash2s = []; + /** @type {Uint8Array[]} */ + let pendingContentBytes = []; + /** @type {Map|null} */ + let pendingContentRowIndexByHash = shouldDedupWithinBatch ? new Map() : null; + /** @type {Map>|null} */ + let pendingContentRowIndexByHashPair = shouldDedupWithinBatch ? 
new Map() : null; + + if (useLocalTransaction) { + await this._beginImmediateTransaction(db); + } + try { + const flushStagedRows = async () => { + if (stagedRows.length === 0) { + stagedPendingContentIndexes = []; + stagedContentOffsets = []; + stagedContentLengths = []; + stagedContentDictNames = []; + pendingContentHashes = []; + pendingContentHash1s = []; + pendingContentHash2s = []; + pendingContentBytes = []; + if (pendingContentRowIndexByHash !== null) { + pendingContentRowIndexByHash.clear(); + } + if (pendingContentRowIndexByHashPair !== null) { + pendingContentRowIndexByHashPair.clear(); + } + return; + } + + if (pendingContentBytes.length > 0) { + totalPendingContentUniqueCount += pendingContentBytes.length; + const tCompressStart = safePerformance.now(); + const storageChunks = this._createTermContentStorageChunks( + pendingContentBytes, + compressionDictName, + stagedRows.map((row, index) => { + const pendingIndex = stagedPendingContentIndexes[index]; + return pendingIndex >= 0 ? (row.termEntryContentDictName ?? null) : null; + }), + ); + compressContentMs += safePerformance.now() - tCompressStart; + const tAppendStart = safePerformance.now(); + const spans = await this._termContentStore.appendBatch(storageChunks.storedChunks); + for (const chunk of storageChunks.storedChunks) { + appendedContentBytes += chunk.byteLength; + } + appendContentMs += safePerformance.now() - tAppendStart; + + for (let i = 0, ii = stagedRows.length; i < ii; ++i) { + const pendingIndex = stagedPendingContentIndexes[i]; + if (pendingIndex < 0) { continue; } + const span = spans[storageChunks.entryToStoredChunkIndexes[pendingIndex]]; + if (typeof span === 'undefined') { + throw new Error('Failed to resolve staged term entry content span for bulk term insert'); + } + stagedPendingContentIndexes[i] = -1; + stagedContentOffsets[i] = span.offset; + stagedContentLengths[i] = span.length; + stagedContentDictNames[i] = storageChunks.contentDictNames[pendingIndex] ?? 
'raw'; + } + + for (let i = 0, ii = pendingContentBytes.length; i < ii; i += contentBatchSize) { + const chunkCount = Math.min(contentBatchSize, ii - i); + const tContentSqlStart = safePerformance.now(); + for (let j = i, jj = i + chunkCount; j < jj; ++j) { + const span = spans[storageChunks.entryToStoredChunkIndexes[j]]; + const contentDictName = storageChunks.contentDictNames[j]; + this._cacheTermEntryContentMeta( + pendingContentHashes[j], + span.offset, + span.length, + contentDictName, + 0, + pendingContentHash1s[j], + pendingContentHash2s[j], + ); + } + const contentBatchMs = safePerformance.now() - tContentSqlStart; + insertContentSqlMs += contentBatchMs; + contentBatchDurationsMs.push(contentBatchMs); + } + } + + for (let i = 0, ii = stagedRows.length; i < ii; i += termBatchSize) { + const chunkCount = Math.min(termBatchSize, ii - i); + const tTermSqlStart = safePerformance.now(); + const split = await this._insertResolvedImportTermEntries(stagedRows, stagedContentOffsets, stagedContentLengths, stagedContentDictNames, i, chunkCount); + const termBatchMs = safePerformance.now() - tTermSqlStart; + insertTermsSqlMs += termBatchMs; + insertTermRecordAppendMs += split.termRecordAppendMs; + insertTermsVtabMs += split.termsVtabInsertMs; + termBatchDurationsMs.push(termBatchMs); + + const batchRowsPerSecond = termBatchMs > 0 ? 
((chunkCount * 1000) / termBatchMs) : 0; + termBatchRowsPerSecond.push(batchRowsPerSecond); + insertedRowCount += chunkCount; + if (this._importDebugLogging && termBatchMs >= this._termBulkAddFailFastSlowBatchMs) { + throw new Error(`term batch stalled: rows=${chunkCount} elapsed=${termBatchMs.toFixed(1)}ms`); + } + + if (this._importDebugLogging && termBatchRowsPerSecond.length >= this._termBulkAddFailFastWindowSize) { + const windowStart = termBatchRowsPerSecond.length - this._termBulkAddFailFastWindowSize; + const window = termBatchRowsPerSecond.slice(windowStart); + const windowAverageRowsPerSecond = window.reduce((sum, value) => sum + value, 0) / window.length; + if (insertedRowCount >= this._termBulkAddFailFastMinRowsBeforeCheck && windowAverageRowsPerSecond < this._termBulkAddFailFastMinRowsPerSecond) { + ++failFastConsecutiveLowThroughputWindows; + if (failFastConsecutiveLowThroughputWindows >= 3) { + throw new Error( + `term batch throughput degraded: window_avg_rps=${windowAverageRowsPerSecond.toFixed(1)} ` + + `threshold=${this._termBulkAddFailFastMinRowsPerSecond.toFixed(1)} rows=${insertedRowCount}/${count}`, + ); + } + } else { + failFastConsecutiveLowThroughputWindows = 0; + } + } + } + + stagedRows = []; + stagedPendingContentIndexes = []; + stagedContentOffsets = []; + stagedContentLengths = []; + stagedContentDictNames = []; + pendingContentHashes = []; + pendingContentHash1s = []; + pendingContentHash2s = []; + pendingContentBytes = []; + pendingContentRowIndexByHash = shouldDedupWithinBatch ? new Map() : null; + pendingContentRowIndexByHashPair = shouldDedupWithinBatch ? new Map() : null; + }; + + for (let i = start, ii = start + count; i < ii; ++i) { + ++processedRowCount; + const row = /** @type {import('dictionary-database').DatabaseTermEntry} */ (items[i]); + const tComputeStart = safePerformance.now(); + const precomputedHash = (typeof row.termEntryContentHash === 'string' && row.termEntryContentHash.length > 0) ? 
row.termEntryContentHash : null; + const precomputedHash1 = Number.isInteger(row.termEntryContentHash1) ? (/** @type {number} */ (row.termEntryContentHash1) >>> 0) : -1; + const precomputedHash2 = Number.isInteger(row.termEntryContentHash2) ? (/** @type {number} */ (row.termEntryContentHash2) >>> 0) : -1; + const hasPrecomputedHashPair = precomputedHash1 >= 0 && precomputedHash2 >= 0; + const precomputedBytes = row.termEntryContentBytes instanceof Uint8Array ? row.termEntryContentBytes : this._getRawTermContentBytesIfAvailable(row); + let contentHash = precomputedHash; + let contentHash1 = precomputedHash1; + let contentHash2 = precomputedHash2; + let contentBytes = precomputedBytes; + if ((contentHash === null && !hasPrecomputedHashPair) || contentBytes === null) { + const rules = row.rules; + const definitionTags = row.definitionTags ?? row.tags ?? ''; + const termTags = row.termTags ?? ''; + const contentJson = row.termEntryContentJson ?? this._serializeTermEntryContent(rules, definitionTags, termTags, row.glossary); + contentHash = contentHash ?? this._hashEntryContent(contentJson); + contentBytes = contentBytes ?? this._textEncoder.encode(contentJson); + if (contentHash1 < 0 || contentHash2 < 0) { + const hashPair = parseContentHashHexPair(contentHash); + if (hashPair !== null) { + [contentHash1, contentHash2] = hashPair; + } + } + } + computeContentMs += safePerformance.now() - tComputeStart; + + let existingMeta = (contentHash1 >= 0 && contentHash2 >= 0) ? 
+ this._getTermEntryContentMetaByHashPair(contentHash1, contentHash2) : + void 0; + if (typeof existingMeta === 'undefined' && contentHash !== null) { + existingMeta = this._termEntryContentMetaByHash.get(contentHash); + } + if (typeof existingMeta !== 'undefined') { + ++resolvedFromCacheCount; + stagedRows.push(row); + stagedPendingContentIndexes.push(-1); + stagedContentOffsets.push(existingMeta.offset); + stagedContentLengths.push(existingMeta.length); + stagedContentDictNames.push(existingMeta.dictName); + continue; + } + + let pendingContentIndex = -1; + if (pendingContentRowIndexByHashPair !== null && contentHash1 >= 0 && contentHash2 >= 0) { + const pendingContentRowIndexByHash2 = pendingContentRowIndexByHashPair.get(contentHash1); + if (typeof pendingContentRowIndexByHash2 !== 'undefined') { + const existingPendingContentIndex = pendingContentRowIndexByHash2.get(contentHash2); + if (typeof existingPendingContentIndex === 'number') { + pendingContentIndex = existingPendingContentIndex; + } + } + } + if (pendingContentIndex < 0 && pendingContentRowIndexByHash !== null && contentHash !== null) { + const existingPendingContentIndex = pendingContentRowIndexByHash.get(contentHash); + if (typeof existingPendingContentIndex === 'number') { + pendingContentIndex = existingPendingContentIndex; + } + } + if (pendingContentIndex < 0) { + const tCompressStart = safePerformance.now(); + compressContentMs += safePerformance.now() - tCompressStart; + pendingContentIndex = pendingContentBytes.length; + if (pendingContentRowIndexByHash !== null && contentHash !== null) { + pendingContentRowIndexByHash.set(contentHash, pendingContentIndex); + } + if (pendingContentRowIndexByHashPair !== null && contentHash1 >= 0 && contentHash2 >= 0) { + let pendingContentRowIndexByHash2 = pendingContentRowIndexByHashPair.get(contentHash1); + if (typeof pendingContentRowIndexByHash2 === 'undefined') { + pendingContentRowIndexByHash2 = new Map(); + 
pendingContentRowIndexByHashPair.set(contentHash1, pendingContentRowIndexByHash2); + } + pendingContentRowIndexByHash2.set(contentHash2, pendingContentIndex); + } + pendingContentHashes.push(contentHash); + pendingContentHash1s.push(contentHash1); + pendingContentHash2s.push(contentHash2); + pendingContentBytes.push(contentBytes); + } + + stagedRows.push(row); + stagedPendingContentIndexes.push(pendingContentIndex); + stagedContentOffsets.push(-1); + stagedContentLengths.push(-1); + stagedContentDictNames.push(null); + + const tNow = safePerformance.now(); + if (this._importDebugLogging && (tNow - lastProgressLog) >= this._termBulkAddLogIntervalMs) { + lastProgressLog = tNow; + log.log( + `[yomitan-db-import] bulkAdd terms progress rows=${processedRowCount}/${count} ` + + `cached=${resolvedFromCacheCount} pendingUnique=${pendingContentBytes.length}`, + ); + } + + if (stagedRows.length >= stagingBatchSize) { + await flushStagedRows(); + } + } + await flushStagedRows(); + if (useLocalTransaction) { + const tCommitStart = safePerformance.now(); + db.exec('COMMIT'); + commitMs = safePerformance.now() - tCommitStart; + } + if (this._importDebugLogging) { + const totalMs = safePerformance.now() - tBulkStart; + const rowsPerSecond = totalMs > 0 ? ((count * 1000) / totalMs) : 0; + const bytesPerSecond = totalMs > 0 ? 
((appendedContentBytes * 1000) / totalMs) : 0; + const avgTermBatchMs = this._average(termBatchDurationsMs); + const p95TermBatchMs = this._p95(termBatchDurationsMs); + const avgContentBatchMs = this._average(contentBatchDurationsMs); + const p95ContentBatchMs = this._p95(contentBatchDurationsMs); + log.log( + `[yomitan-db-import] bulkAdd terms done rows=${count} total=${totalMs.toFixed(1)}ms ` + + `compute=${computeContentMs.toFixed(1)}ms compress=${compressContentMs.toFixed(1)}ms ` + + `append=${appendContentMs.toFixed(1)}ms contentSql=${insertContentSqlMs.toFixed(1)}ms ` + + `termsSql=${insertTermsSqlMs.toFixed(1)}ms termRecordAppend=${insertTermRecordAppendMs.toFixed(1)}ms ` + + `termsVtabInsert=${insertTermsVtabMs.toFixed(1)}ms commit=${commitMs.toFixed(1)}ms ` + + `intraBatchDedup=${String(shouldDedupWithinBatch)} ` + + `recordFastPath=${String(this._termRecordRowAppendFastPath)} ` + + `stagingBatchSize=${stagingBatchSize} ` + + `cached=${resolvedFromCacheCount} newUnique=${totalPendingContentUniqueCount} ` + + `rps=${rowsPerSecond.toFixed(1)} bps=${bytesPerSecond.toFixed(1)} ` + + `termBatchAvg=${avgTermBatchMs.toFixed(1)}ms termBatchP95=${p95TermBatchMs.toFixed(1)}ms ` + + `contentBatchAvg=${avgContentBatchMs.toFixed(1)}ms contentBatchP95=${p95ContentBatchMs.toFixed(1)}ms`, + ); + } + } catch (e) { + if (useLocalTransaction) { + try { db.exec('ROLLBACK'); } catch (_) { /* NOP */ } + } + throw e; + } + } + + /** + * @param {import('@sqlite.org/sqlite-wasm').Bindable[][]} rows + * @param {number} start + * @param {number} count + * @returns {Promise<{termRecordAppendMs: number, termsVtabInsertMs: number}>} + */ + async _insertResolvedTermRows(rows, start, count) { + const tRecordAppendStart = safePerformance.now(); + if (this._termRecordRowAppendFastPath) { + await this._termRecordStore.appendBatchFromTermRows(rows, start, count); + } else { + /** @type {{dictionary: string, expression: string, reading: string, expressionReverse: string|null, readingReverse: 
string|null, entryContentOffset: number, entryContentLength: number, entryContentDictName: string|null, score: number, sequence: number|null}[]} */ + const records = []; + for (let i = start, ii = start + count; i < ii; ++i) { + const row = rows[i]; + records.push({ + dictionary: this._asString(row[0]), + expression: this._asString(row[1]), + reading: this._asString(row[2]), + expressionReverse: this._asNullableString(row[3]) ?? null, + readingReverse: this._asNullableString(row[4]) ?? null, + entryContentOffset: this._asNumber(row[6], -1), + entryContentLength: this._asNumber(row[7], -1), + entryContentDictName: this._asNullableString(row[8]), + score: this._asNumber(row[12], 0), + sequence: this._asNullableNumber(row[14]) ?? null, + }); + } + await this._termRecordStore.appendBatch(records); + } + const termRecordAppendMs = safePerformance.now() - tRecordAppendStart; + let termsVtabInsertMs = 0; + const deferVirtualTableWrite = this._deferTermsVirtualTableSync || this._bulkImportDepth > 0; + if (deferVirtualTableWrite) { + this._termsVirtualTableDirty = true; + } else { + const tVtabStart = safePerformance.now(); + await this._insertTermRowsIntoVirtualTable(count); + termsVtabInsertMs = safePerformance.now() - tVtabStart; + } + return {termRecordAppendMs, termsVtabInsertMs}; + } + + /** + * @param {import('dictionary-database').DatabaseTermEntry[]} rows + * @param {number[]} contentOffsets + * @param {number[]} contentLengths + * @param {(string|null)[]} contentDictNames + * @param {number} start + * @param {number} count + * @returns {Promise<{termRecordAppendMs: number, termRecordEncodeMs: number, termRecordWriteMs: number, termsVtabInsertMs: number}>} + */ + async _insertResolvedImportTermEntries(rows, contentOffsets, contentLengths, contentDictNames, start, count) { + const tRecordAppendStart = safePerformance.now(); + let termRecordEncodeMs = 0; + let termRecordWriteMs = 0; + if (this._termRecordRowAppendFastPath) { + const metrics = await 
this._termRecordStore.appendBatchFromResolvedImportTermEntries(rows, start, count, contentOffsets, contentLengths, contentDictNames); + termRecordEncodeMs = metrics.encodeMs; + termRecordWriteMs = metrics.appendWriteMs; + } else { + /** @type {{dictionary: string, expression: string, reading: string, expressionReverse: string|null, readingReverse: string|null, entryContentOffset: number, entryContentLength: number, entryContentDictName: string|null, score: number, sequence: number|null}[]} */ + const records = []; + for (let i = start, ii = start + count; i < ii; ++i) { + const row = rows[i]; + records.push({ + dictionary: row.dictionary, + expression: row.expression, + reading: row.reading, + expressionReverse: row.expressionReverse ?? null, + readingReverse: row.readingReverse ?? null, + entryContentOffset: contentOffsets[i], + entryContentLength: contentLengths[i], + entryContentDictName: contentDictNames[i], + score: row.score, + sequence: typeof row.sequence === 'number' ? row.sequence : null, + }); + } + await this._termRecordStore.appendBatch(records); + } + const termRecordAppendMs = safePerformance.now() - tRecordAppendStart; + let termsVtabInsertMs = 0; + const deferVirtualTableWrite = this._deferTermsVirtualTableSync || this._bulkImportDepth > 0; + if (deferVirtualTableWrite) { + this._termsVirtualTableDirty = true; + } else { + const tVtabStart = safePerformance.now(); + await this._insertTermRowsIntoVirtualTable(count); + termsVtabInsertMs = safePerformance.now() - tVtabStart; + } + return {termRecordAppendMs, termRecordEncodeMs, termRecordWriteMs, termsVtabInsertMs}; + } + + /** + * @param {number} count + * @returns {Promise} + */ + async _insertTermRowsIntoVirtualTable(count) { + this._termsVirtualTableDirty = count > 0; + } + + /** + * @param {{values: import('@sqlite.org/sqlite-wasm').Bindable[], contentKey: string|null}[]} rows + * @throws {Error} + */ + async _insertResolvedTermRowsWithContentKeys(rows) { + for (const row of rows) { + const 
{contentKey} = row; + if (contentKey !== null) { + const contentId = this._termEntryContentIdByKey.get(contentKey); + if (typeof contentId !== 'number') { + throw new Error('Failed to resolve term entry content id for batched insert'); + } + const meta = this._termEntryContentMetaByHash.get(contentKey); + if (typeof meta === 'undefined') { + throw new Error('Failed to resolve term entry content metadata for batched insert'); + } + row.values[5] = contentId; + row.values[6] = meta.offset; + row.values[7] = meta.length; + row.values[8] = meta.dictName; + } + } + await this._insertResolvedTermRows(rows.map((row) => row.values), 0, rows.length); + } + + /** */ + _clearTermEntryContentMetaCaches() { + this._termEntryContentMetaByHash.clear(); + this._termEntryContentMetaByHashPair.clear(); + } + + /** + * @param {number} hash1 + * @param {number} hash2 + * @returns {{id: number, offset: number, length: number, dictName: string}|undefined} + */ + _getTermEntryContentMetaByHashPair(hash1, hash2) { + const byHash2 = this._termEntryContentMetaByHashPair.get(hash1 >>> 0); + return typeof byHash2 !== 'undefined' ? 
byHash2.get(hash2 >>> 0) : void 0; + } + + /** + * @param {number} hash1 + * @param {number} hash2 + * @param {{id: number, offset: number, length: number, dictName: string}} meta + */ + _setTermEntryContentMetaByHashPair(hash1, hash2, meta) { + hash1 >>>= 0; + hash2 >>>= 0; + let byHash2 = this._termEntryContentMetaByHashPair.get(hash1); + if (typeof byHash2 === 'undefined') { + byHash2 = new Map(); + this._termEntryContentMetaByHashPair.set(hash1, byHash2); + } + byHash2.set(hash2, meta); + } + + /** + * @param {string|null} contentHash + * @param {number} offset + * @param {number} length + * @param {string|null|undefined} dictName + * @param {number} [id] + * @param {number} [hash1] + * @param {number} [hash2] + * @returns {{id: number, offset: number, length: number, dictName: string}} + */ + _cacheTermEntryContentMeta(contentHash, offset, length, dictName, id = 0, hash1 = -1, hash2 = -1) { + const meta = {id, offset, length, dictName: dictName ?? 'raw'}; + if (typeof contentHash === 'string' && contentHash.length > 0) { + this._termEntryContentMetaByHash.set(contentHash, meta); + if (hash1 < 0 || hash2 < 0) { + const parsedHashPair = parseContentHashHexPair(contentHash); + if (parsedHashPair !== null) { + [hash1, hash2] = parsedHashPair; + } + } + } + if (hash1 >= 0 && hash2 >= 0) { + this._setTermEntryContentMetaByHashPair(hash1, hash2, meta); + } + return meta; + } + + /** + * @param {{contentKey: string, contentHash: string, contentBytes: Uint8Array, contentDictName: string|null}[]} rows + * @throws {Error} + */ + async _insertTermEntryContentBatch(rows) { + if (rows.length === 0) { return; } + const spans = await this._termContentStore.appendBatch(rows.map((row) => row.contentBytes)); + this._insertTermEntryContentBatchWithSpans(rows, spans, 0, rows.length); + } + + /** + * @param {{contentHash: string, contentDictName: string|null}[]} rows + * @param {{offset: number, length: number}[]} spans + * @param {number} start + * @param {number} count + * 
@throws {Error} + */ + _insertTermEntryContentBatchWithSpans(rows, spans, start, count) { + if (count <= 0) { return; } + /** @type {string[]} */ + const valueRows = []; + /** @type {import('@sqlite.org/sqlite-wasm').Bindable[]} */ + const bind = []; + for (let i = start, ii = start + count; i < ii; ++i) { + const row = rows[i]; + const span = spans[i]; + valueRows.push('(?, NULL, ?, \'\', \'\', \'\', \'[]\', ?, ?)'); + bind.push(row.contentHash, row.contentDictName, span.offset, span.length); + } + const sql = ` + INSERT INTO termEntryContent(contentHash, contentZstd, contentDictName, rules, definitionTags, termTags, glossaryJson, contentOffset, contentLength) + VALUES ${valueRows.join(',')} + `; + const stmt = this._getCachedStatement(sql); + stmt.reset(true); + stmt.bind(bind); + stmt.step(); + + const db = this._requireDb(); + const lastInsertRowId = this._asNumber(db.selectValue('SELECT last_insert_rowid()'), -1); + if (lastInsertRowId <= 0) { + throw new Error('Failed to insert batched term entry content'); + } + const firstId = lastInsertRowId - count + 1; + for (let i = start, ii = start + count; i < ii; ++i) { + const id = firstId + (i - start); + this._termEntryContentIdByHash.set(rows[i].contentHash, id); + this._termEntryContentIdByKey.set(rows[i].contentHash, id); + this._cacheTermEntryContentMeta(rows[i].contentHash, spans[i].offset, spans[i].length, rows[i].contentDictName, id); + } + } + + /** + * @param {import('dictionary-database').ObjectStoreData<'terms'>[]} items + * @param {number} start + * @param {number} count + * @returns {Promise} + */ + async _bulkAddTermsWithoutContentDedup(items, start, count) { + const useLocalTransaction = !this._bulkImportTransactionOpen; + const batchSize = this._getTermBulkAddBatchSizeForCount(count); + let contentAppendMs = 0; + let termRecordBuildMs = 0; + let termRecordEncodeMs = 0; + let termRecordWriteMs = 0; + let termsVtabInsertMs = 0; + + if (useLocalTransaction) { + await 
this._beginImmediateTransaction(this._requireDb()); + } + try { + for (let i = start, ii = start + count; i < ii; i += batchSize) { + const chunkCount = Math.min(batchSize, ii - i); + /** @type {Uint8Array[]} */ + const contentChunks = new Array(chunkCount); + /** @type {number[]} */ + const contentOffsets = new Array(chunkCount); + /** @type {number[]} */ + const contentLengths = new Array(chunkCount); + for (let j = 0; j < chunkCount; ++j) { + const row = /** @type {import('dictionary-database').DatabaseTermEntry} */ (items[i + j]); + const precomputedContentBytes = row.termEntryContentBytes instanceof Uint8Array ? row.termEntryContentBytes : this._getRawTermContentBytesIfAvailable(row); + if (precomputedContentBytes instanceof Uint8Array) { + contentChunks[j] = precomputedContentBytes; + continue; + } + const rules = row.rules ?? ''; + const definitionTags = row.definitionTags ?? row.tags ?? ''; + const termTags = row.termTags ?? ''; + const contentJson = row.termEntryContentJson ?? this._serializeTermEntryContent(rules, definitionTags, termTags, row.glossary); + contentChunks[j] = this._textEncoder.encode(contentJson); + } + let chunksToAppend = contentChunks; + const tContentAppendStart = safePerformance.now(); + if (this._termContentStorageMode === TERM_CONTENT_STORAGE_MODE_RAW_BYTES) { + const {packedChunks, sourceChunkIndices, sourceChunkLocalOffsets} = packContentChunksIntoSlabs( + contentChunks, + this._rawTermContentPackTargetBytes, + ); + chunksToAppend = packedChunks; + /** @type {number[]} */ + const packedOffsets = new Array(packedChunks.length); + /** @type {number[]} */ + const packedLengths = new Array(packedChunks.length); + await this._termContentStore.appendBatchToArrays(packedChunks, packedOffsets, packedLengths); + for (let j = 0; j < chunkCount; ++j) { + const packedIndex = sourceChunkIndices[j]; + contentOffsets[j] = packedOffsets[packedIndex] + sourceChunkLocalOffsets[j]; + contentLengths[j] = contentChunks[j].byteLength; + } + } else { + 
await this._termContentStore.appendBatchToArrays(contentChunks, contentOffsets, contentLengths); + } + contentAppendMs += safePerformance.now() - tContentAppendStart; + const explicitContentDictName = chunkCount > 0 ? (items[i].termEntryContentDictName ?? null) : null; + let contentDictName = 'raw'; + if ( + this._termContentStorageMode === TERM_CONTENT_STORAGE_MODE_RAW_BYTES && + typeof explicitContentDictName === 'string' && + explicitContentDictName.length > 0 + ) { + contentDictName = explicitContentDictName; + } else if ( + this._termContentStorageMode === TERM_CONTENT_STORAGE_MODE_RAW_BYTES && + chunksToAppend.every((contentBytes) => isRawTermContentBinary(contentBytes)) + ) { + contentDictName = RAW_TERM_CONTENT_DICT_NAME; + } + const metrics = await this._termRecordStore.appendBatchFromImportTermEntriesResolvedContent(items, i, chunkCount, contentOffsets, contentLengths, contentDictName); + termRecordBuildMs += metrics.buildRecordsMs; + termRecordEncodeMs += metrics.encodeMs; + termRecordWriteMs += metrics.appendWriteMs; + const deferVirtualTableWrite = this._deferTermsVirtualTableSync || this._bulkImportDepth > 0; + if (deferVirtualTableWrite) { + this._termsVirtualTableDirty = true; + } else { + const tTermsVtabInsertStart = safePerformance.now(); + await this._insertTermRowsIntoVirtualTable(chunkCount); + termsVtabInsertMs += safePerformance.now() - tTermsVtabInsertStart; + } + } + if (useLocalTransaction) { + this._requireDb().exec('COMMIT'); + } + this._lastBulkAddTermsMetrics = { + contentAppendMs, + termRecordBuildMs, + termRecordEncodeMs, + termRecordWriteMs, + termsVtabInsertMs, + }; + if (this._importDebugLogging) { + log.log( + `[yomitan-db-import] bulkAdd terms no-dedup contentAppend=${contentAppendMs.toFixed(1)}ms ` + + `termRecordBuild=${termRecordBuildMs.toFixed(1)}ms ` + + `termRecordEncode=${termRecordEncodeMs.toFixed(1)}ms ` + + `termRecordWrite=${termRecordWriteMs.toFixed(1)}ms ` + + `termsVtabInsert=${termsVtabInsertMs.toFixed(1)}ms`, + 
); + } + } catch (e) { + if (useLocalTransaction) { + try { this._requireDb().exec('ROLLBACK'); } catch (_) { /* NOP */ } + } + throw e; + } + } + + /** + * @param {import('@sqlite.org/sqlite-wasm').PreparedStatement} insertContentStmt + * @param {string} contentHash + * @param {Uint8Array} contentZstd + * @param {string|null} contentDictName + * @param {string} contentKey + * @returns {Promise} + * @throws {Error} + */ + async _resolveOrCreateTermEntryContentId(insertContentStmt, contentHash, contentZstd, contentDictName, contentKey) { + const cachedId = this._termEntryContentIdByKey.get(contentKey); + if (typeof cachedId === 'number') { + return cachedId; + } + const cachedHashId = this._termEntryContentIdByHash.get(contentHash); + if (typeof cachedHashId === 'number') { + this._termEntryContentIdByKey.set(contentKey, cachedHashId); + if (!this._termEntryContentMetaByHash.has(contentHash)) { + const stmt = this._getCachedStatement('SELECT contentOffset, contentLength, contentDictName FROM termEntryContent WHERE id = $id LIMIT 1'); + stmt.reset(true); + stmt.bind({$id: cachedHashId}); + if (stmt.step()) { + const row = /** @type {import('core').SafeAny} */ (stmt.get({})); + const offset = this._asNumber(row.contentOffset, -1); + const length = this._asNumber(row.contentLength, -1); + const dictName = this._asNullableString(row.contentDictName) ?? 
'raw'; + if (offset >= 0 && length > 0) { + this._cacheTermEntryContentMeta(contentHash, offset, length, dictName, cachedHashId); + } + } + } + if (this._termEntryContentMetaByHash.has(contentHash)) { + return cachedHashId; + } + } + + insertContentStmt.reset(true); + const [span] = await this._termContentStore.appendBatch([contentZstd]); + insertContentStmt.bind({ + $contentHash: contentHash, + $contentDictName: contentDictName, + $contentOffset: span.offset, + $contentLength: span.length, + }); + insertContentStmt.step(); + + const db = this._requireDb(); + const id = this._asNumber(db.selectValue('SELECT last_insert_rowid()'), -1); + if (id <= 0) { + throw new Error('Failed to insert term entry content'); + } + this._termEntryContentIdByHash.set(contentHash, id); + this._termEntryContentIdByKey.set(contentKey, id); + this._cacheTermEntryContentMeta(contentHash, span.offset, span.length, contentDictName, id); + return id; + } + + /** */ + _loadTermEntryContentHashIndex() { + if (this._termEntryContentIdByHash.size > 0) { return; } + const stmt = this._getCachedStatement('SELECT id, contentHash, contentOffset, contentLength, contentDictName FROM termEntryContent'); + stmt.reset(true); + while (stmt.step()) { + const row = /** @type {import('core').SafeAny[]} */ (stmt.get([])); + const id = this._asNumber(row[0], -1); + if (id <= 0) { continue; } + const contentHash = this._asString(row[1]); + if (contentHash.length === 0) { continue; } + const offset = this._asNumber(row[2], -1); + const length = this._asNumber(row[3], -1); + const dictName = this._asNullableString(row[4]) ?? 
'raw'; + if (offset >= 0 && length > 0) { + if (!this._termEntryContentIdByHash.has(contentHash)) { + this._termEntryContentIdByHash.set(contentHash, id); + } + this._cacheTermEntryContentMeta(contentHash, offset, length, dictName, id); + } + } + } + + /** */ + _pruneOrphanTermEntryContent() { + const db = this._requireDb(); + db.exec(` + DELETE FROM termEntryContent + WHERE id NOT IN ( + SELECT DISTINCT entryContentId + FROM terms + WHERE entryContentId IS NOT NULL + ) + `); + } + + // Parent-Worker API + + /** + * @param {MessagePort} port + */ + async connectToDatabaseWorker(port) { + if (this._worker !== null) { + this._worker.postMessage({action: 'connectToDatabaseWorker'}, [port]); + return; + } + + port.onmessage = (/** @type {MessageEvent} */ event) => { + const {action, params} = event.data; + return invokeApiMapHandler(this._apiMap, action, params, [port], () => {}); + }; + port.onmessageerror = (event) => { + const error = new ExtensionError('DictionaryDatabase: Error receiving message from main thread'); + error.data = event; + log.error(error); + }; + } + + /** @type {import('dictionary-database').ApiHandler<'drawMedia'>} */ + _onDrawMedia(params, port) { + void this.drawMedia(params.requests, port); + } + + // Private + + /** + * @returns {Promise} + */ + async _openConnection() { + this._sqlite3 = await getSqlite3(); + try { + this._db = await openOpfsDatabase('DictionaryDatabase._openConnection'); + } catch (error) { + const diagnostics = getLastOpenStorageDiagnostics(); + const message = (error instanceof Error) ? error.message : String(error); + throw new Error(`Dictionary database open failed: ${message}. 
diagnostics=${JSON.stringify(diagnostics)}`); + } + this._usesFallbackStorage = didLastOpenUseFallbackStorage(); + this._openStorageDiagnostics = getLastOpenStorageDiagnostics(); + await this._termContentStore.prepare(); + await this._termRecordStore.prepare(); + this._clearTermsVtabCursorState(); + this._termsVtabModuleRegistered = false; + + this._applyRuntimePragmas(); + + await this._initializeSchema(); + await this._runSchemaMigrations(); + } + + /** */ + async _initializeSchema() { + const db = this._requireDb(); + db.exec(` + CREATE TABLE IF NOT EXISTS dictionaries ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + title TEXT NOT NULL, + version INTEGER NOT NULL, + summaryJson TEXT NOT NULL + ); + + CREATE TABLE IF NOT EXISTS termEntryContent ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + contentHash TEXT NOT NULL, + contentZstd BLOB, + contentOffset INTEGER, + contentLength INTEGER, + contentDictName TEXT, + rules TEXT NOT NULL, + definitionTags TEXT NOT NULL, + termTags TEXT NOT NULL, + glossaryJson TEXT NOT NULL + ); + + CREATE TABLE IF NOT EXISTS termMeta ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + dictionary TEXT NOT NULL, + expression TEXT NOT NULL, + mode TEXT NOT NULL, + dataJson TEXT NOT NULL + ); + + CREATE TABLE IF NOT EXISTS kanji ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + dictionary TEXT NOT NULL, + character TEXT NOT NULL, + onyomi TEXT, + kunyomi TEXT, + tags TEXT, + meaningsJson TEXT NOT NULL, + statsJson TEXT + ); + + CREATE TABLE IF NOT EXISTS kanjiMeta ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + dictionary TEXT NOT NULL, + character TEXT NOT NULL, + mode TEXT NOT NULL, + dataJson TEXT NOT NULL + ); + + CREATE TABLE IF NOT EXISTS tagMeta ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + dictionary TEXT NOT NULL, + name TEXT NOT NULL, + category TEXT, + ord INTEGER, + notes TEXT, + score INTEGER + ); + + CREATE TABLE IF NOT EXISTS media ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + dictionary TEXT NOT NULL, + path TEXT NOT NULL, + mediaType TEXT NOT NULL, + 
width INTEGER NOT NULL, + height INTEGER NOT NULL, + content BLOB NOT NULL + ); + + CREATE TABLE IF NOT EXISTS sharedGlossaryArtifacts ( + dictionary TEXT PRIMARY KEY, + contentOffset INTEGER NOT NULL, + contentLength INTEGER NOT NULL, + contentDictName TEXT NOT NULL, + uncompressedLength INTEGER NOT NULL + ); + `); + await this._migrateTermsContentSchema(); + await this._ensureTermsVirtualTable(); + if (!this._enableSqliteSecondaryIndexes) { + for (const dropIndexSql of this._createDropIndexesSql()) { + db.exec(dropIndexSql); + } + } + for (const createIndexSql of this._createIndexesSql()) { + db.exec(createIndexSql); + } + } + + /** */ + async _runSchemaMigrations() { + const db = this._requireDb(); + const installedSchemaVersion = Math.max(0, this._asNumber(db.selectValue('PRAGMA user_version'), 0)); + if (installedSchemaVersion > CURRENT_DICTIONARY_SCHEMA_VERSION) { + reportDiagnostics('dictionary-schema-migration-skipped', { + reason: 'newer-installed-version', + installedSchemaVersion, + currentSchemaVersion: CURRENT_DICTIONARY_SCHEMA_VERSION, + }); + return; + } + let currentSchemaVersion = installedSchemaVersion; + let migrationCount = 0; + while (currentSchemaVersion < CURRENT_DICTIONARY_SCHEMA_VERSION) { + const nextVersion = currentSchemaVersion + 1; + const migrationStart = safePerformance.now(); + const migrationSummary = await this._runSchemaMigrationToVersion(nextVersion); + db.exec(`PRAGMA user_version = ${nextVersion}`); + ++migrationCount; + reportDiagnostics('dictionary-schema-migration-applied', { + fromVersion: currentSchemaVersion, + toVersion: nextVersion, + elapsedMs: Math.max(0, safePerformance.now() - migrationStart), + summary: migrationSummary, + }); + currentSchemaVersion = nextVersion; + } + reportDiagnostics('dictionary-schema-migration-summary', { + installedSchemaVersion, + currentSchemaVersion, + migrationCount, + }); + } + + /** + * @param {number} version + * @returns {Promise>} + */ + async _runSchemaMigrationToVersion(version) 
{ + switch (version) { + case 1: + return await this._wipeDictionaryDataForSchemaMigration('wipe-unversioned-dictionary-data'); + case 2: + return await this._migrateSchemaVersion2(); + case 3: + return await this._wipeDictionaryDataForSchemaMigration('reset-dictionary-data-for-raw-v3'); + case 4: + return await this._wipeDictionaryDataForSchemaMigration('reset-dictionary-data-for-raw-v4'); + default: + throw new Error(`Unhandled dictionary schema migration target version: ${version}`); + } + } + + /** + * Migration v1: reset all imported dictionary data when legacy installs had no schema version. + * @param {string} migration + * @returns {Promise>} + */ + async _wipeDictionaryDataForSchemaMigration(migration) { + const db = this._requireDb(); + const dictionariesBefore = this._asNumber(db.selectValue('SELECT COUNT(*) FROM dictionaries'), 0); + const termMetaBefore = this._asNumber(db.selectValue('SELECT COUNT(*) FROM termMeta'), 0); + const kanjiBefore = this._asNumber(db.selectValue('SELECT COUNT(*) FROM kanji'), 0); + const kanjiMetaBefore = this._asNumber(db.selectValue('SELECT COUNT(*) FROM kanjiMeta'), 0); + const tagMetaBefore = this._asNumber(db.selectValue('SELECT COUNT(*) FROM tagMeta'), 0); + const mediaBefore = this._asNumber(db.selectValue('SELECT COUNT(*) FROM media'), 0); + const termContentBefore = this._asNumber(db.selectValue('SELECT COUNT(*) FROM termEntryContent'), 0); + const sharedGlossaryArtifactsBefore = this._asNumber(db.selectValue('SELECT COUNT(*) FROM sharedGlossaryArtifacts'), 0); + const termRecordsBefore = this._termRecordStore.size; + + await this._termContentStore.reset(); + await this._termRecordStore.reset(); + await this._beginImmediateTransaction(db); + try { + db.exec('DELETE FROM media'); + db.exec('DELETE FROM tagMeta'); + db.exec('DELETE FROM kanjiMeta'); + db.exec('DELETE FROM kanji'); + db.exec('DELETE FROM termMeta'); + db.exec('DELETE FROM termEntryContent'); + db.exec('DELETE FROM sharedGlossaryArtifacts'); + 
db.exec('DELETE FROM dictionaries'); + db.exec('COMMIT'); + } catch (e) { + try { db.exec('ROLLBACK'); } catch (_) { /* NOP */ } + throw e; + } + + this._termEntryContentCache.clear(); + this._termEntryContentIdByHash.clear(); + this._clearTermEntryContentMetaCaches(); + this._termExactPresenceCache.clear(); + this._termPrefixNegativeCache.clear(); + this._directTermIndexByDictionary.clear(); + this._termEntryContentIdByKey.clear(); + this._sharedGlossaryArtifactMetaByDictionary.clear(); + this._sharedGlossaryArtifactInflatedByDictionary.clear(); + this._termsVirtualTableDirty = false; + this._deferTermsVirtualTableSync = false; + + return { + migration, + dictionariesBefore, + termRecordsBefore, + termContentBefore, + termMetaBefore, + kanjiBefore, + kanjiMetaBefore, + tagMetaBefore, + mediaBefore, + sharedGlossaryArtifactsBefore, + }; + } + + /** + * Migration v2: reserved scaffold for future schema changes. + * @returns {Promise>} + */ + async _migrateSchemaVersion2() { + await Promise.resolve(); + return { + migration: 'schema-v2-noop', + }; + } + + /** + * @returns {string[]} + */ + _createIndexesSql() { + if (!this._enableSqliteSecondaryIndexes) { + return []; + } + return [ + 'CREATE INDEX IF NOT EXISTS idx_dictionaries_title ON dictionaries(title)', + 'CREATE INDEX IF NOT EXISTS idx_dictionaries_version ON dictionaries(version)', + 'CREATE INDEX IF NOT EXISTS idx_term_entry_content_hash ON termEntryContent(contentHash)', + 'CREATE INDEX IF NOT EXISTS idx_term_meta_expression_dictionary ON termMeta(expression, dictionary)', + 'CREATE INDEX IF NOT EXISTS idx_kanji_character_dictionary ON kanji(character, dictionary)', + 'CREATE INDEX IF NOT EXISTS idx_kanji_meta_character_dictionary ON kanjiMeta(character, dictionary)', + 'CREATE INDEX IF NOT EXISTS idx_tag_meta_dictionary_name ON tagMeta(dictionary, name)', + 'CREATE INDEX IF NOT EXISTS idx_media_dictionary_path ON media(dictionary, path)', + ]; + } + + /** + * @returns {string[]} + */ + 
_createDropIndexesSql() { + return [ + 'DROP INDEX IF EXISTS idx_dictionaries_title', + 'DROP INDEX IF EXISTS idx_dictionaries_version', + 'DROP INDEX IF EXISTS idx_term_entry_content_hash', + 'DROP INDEX IF EXISTS idx_term_meta_expression_dictionary', + 'DROP INDEX IF EXISTS idx_kanji_character_dictionary', + 'DROP INDEX IF EXISTS idx_kanji_meta_character_dictionary', + 'DROP INDEX IF EXISTS idx_tag_meta_dictionary_name', + 'DROP INDEX IF EXISTS idx_media_dictionary_path', + // Legacy index names from pre-optimization schema revisions. + 'DROP INDEX IF EXISTS idx_terms_expression', + 'DROP INDEX IF EXISTS idx_terms_reading', + 'DROP INDEX IF EXISTS idx_terms_sequence', + 'DROP INDEX IF EXISTS idx_terms_expression_reverse', + 'DROP INDEX IF EXISTS idx_terms_reading_reverse', + 'DROP INDEX IF EXISTS idx_term_meta_dictionary', + 'DROP INDEX IF EXISTS idx_term_meta_expression', + 'DROP INDEX IF EXISTS idx_kanji_dictionary', + 'DROP INDEX IF EXISTS idx_kanji_character', + 'DROP INDEX IF EXISTS idx_kanji_meta_dictionary', + 'DROP INDEX IF EXISTS idx_kanji_meta_character', + 'DROP INDEX IF EXISTS idx_tag_meta_dictionary', + 'DROP INDEX IF EXISTS idx_tag_meta_name', + 'DROP INDEX IF EXISTS idx_media_dictionary', + 'DROP INDEX IF EXISTS idx_media_path', + ]; + } + + /** + * Ensures terms are represented by a SQLite virtual table while record payload metadata remains external. + */ + async _ensureTermsVirtualTable() { + const db = this._requireDb(); + this._registerTermsVirtualTableModule(); + const termsEntry = db.selectObject('SELECT type, sql FROM sqlite_master WHERE name = \'terms\''); + const termsType = typeof termsEntry === 'undefined' ? '' : this._asString(termsEntry.type); + const termsSql = typeof termsEntry === 'undefined' ? 
'' : this._asString(termsEntry.sql).toUpperCase(); + const isVirtualTerms = termsSql.startsWith('CREATE VIRTUAL TABLE'); + if (termsType === 'table' && !isVirtualTerms) { + await this._migrateLegacyTermsTableToExternalStore(); + db.exec('DROP TABLE terms'); + } else if (isVirtualTerms && !termsSql.includes('YOMITAN_TERMS')) { + db.exec('DROP TABLE terms'); + } + db.exec(` + CREATE VIRTUAL TABLE IF NOT EXISTS terms USING yomitan_terms( + dictionary, + expression, + reading, + expressionReverse, + readingReverse, + entryContentId, + entryContentOffset, + entryContentLength, + entryContentDictName, + definitionTags, + termTags, + rules, + score, + glossaryJson, + sequence + ) + `); + this._termsVirtualTableDirty = false; + } + + /** + * Ensures the SQLite vtable projection matches the external term record store. + * @returns {Promise} + */ + async _syncTermsVirtualTableFromRecordStore() { + this._termsVirtualTableDirty = false; + } + + /** */ + async _migrateLegacyTermsTableToExternalStore() { + if (!this._termRecordStore.isEmpty()) { + return; + } + const db = this._requireDb(); + const termsTableInfo = db.selectObjects('PRAGMA table_info(terms)'); + const termsColumns = new Set(termsTableInfo.map((row) => this._asString(row.name))); + const hasEntryContentOffset = termsColumns.has('entryContentOffset'); + const hasEntryContentLength = termsColumns.has('entryContentLength'); + const hasEntryContentDictName = termsColumns.has('entryContentDictName'); + const hasEntryContentId = termsColumns.has('entryContentId'); + const entryContentOffsetExpr = hasEntryContentOffset ? 't.entryContentOffset' : (hasEntryContentId ? 'c.contentOffset' : '-1'); + const entryContentLengthExpr = hasEntryContentLength ? 't.entryContentLength' : (hasEntryContentId ? 'c.contentLength' : '-1'); + const entryContentDictNameExpr = hasEntryContentDictName ? 't.entryContentDictName' : (hasEntryContentId ? 
'c.contentDictName' : '\'raw\''); + const stmt = this._getCachedStatement(` + SELECT + t.dictionary AS dictionary, + t.expression AS expression, + t.reading AS reading, + t.expressionReverse AS expressionReverse, + t.readingReverse AS readingReverse, + ${entryContentOffsetExpr} AS entryContentOffset, + ${entryContentLengthExpr} AS entryContentLength, + COALESCE(${entryContentDictNameExpr}, 'raw') AS entryContentDictName, + t.score AS score, + t.sequence AS sequence + FROM terms t + ${hasEntryContentId ? 'LEFT JOIN termEntryContent c ON c.id = t.entryContentId' : ''} + `); + stmt.reset(true); + /** @type {{dictionary: string, expression: string, reading: string, expressionReverse: string|null, readingReverse: string|null, entryContentOffset: number, entryContentLength: number, entryContentDictName: string|null, score: number, sequence: number|null}[]} */ + let batch = []; + while (stmt.step()) { + const row = /** @type {import('core').SafeAny} */ (stmt.get({})); + batch.push({ + dictionary: this._asString(row.dictionary), + expression: this._asString(row.expression), + reading: this._asString(row.reading), + expressionReverse: this._asNullableString(row.expressionReverse) ?? null, + readingReverse: this._asNullableString(row.readingReverse) ?? null, + entryContentOffset: this._asNumber(row.entryContentOffset, -1), + entryContentLength: this._asNumber(row.entryContentLength, -1), + entryContentDictName: this._asNullableString(row.entryContentDictName), + score: this._asNumber(row.score, 0), + sequence: this._asNullableNumber(row.sequence) ?? 
null, + }); + if (batch.length >= 4096) { + await this._termRecordStore.appendBatch(batch); + batch = []; + } + } + if (batch.length > 0) { + await this._termRecordStore.appendBatch(batch); + } + } + + /** */ + async _migrateTermsContentSchema() { + const db = this._requireDb(); + const contentTableInfo = db.selectObjects('PRAGMA table_info(termEntryContent)'); + const hasContentZstd = contentTableInfo.some((row) => this._asString(row.name) === 'contentZstd'); + const hasContentOffset = contentTableInfo.some((row) => this._asString(row.name) === 'contentOffset'); + const hasContentLength = contentTableInfo.some((row) => this._asString(row.name) === 'contentLength'); + const hasContentDictName = contentTableInfo.some((row) => this._asString(row.name) === 'contentDictName'); + if (!hasContentZstd) { + db.exec('ALTER TABLE termEntryContent ADD COLUMN contentZstd BLOB'); + } + if (!hasContentDictName) { + db.exec('ALTER TABLE termEntryContent ADD COLUMN contentDictName TEXT'); + } + if (!hasContentOffset) { + db.exec('ALTER TABLE termEntryContent ADD COLUMN contentOffset INTEGER'); + } + if (!hasContentLength) { + db.exec('ALTER TABLE termEntryContent ADD COLUMN contentLength INTEGER'); + } + + const termsEntry = db.selectObject('SELECT type, sql FROM sqlite_master WHERE name = \'terms\''); + const termsSql = typeof termsEntry === 'undefined' ? 
'' : this._asString(termsEntry.sql).toUpperCase(); + const isVirtualTerms = termsSql.startsWith('CREATE VIRTUAL TABLE'); + if (typeof termsEntry === 'undefined' || this._asString(termsEntry.type) !== 'table' || isVirtualTerms) { + return; + } + + const tableInfo = db.selectObjects('PRAGMA table_info(terms)'); + const hasEntryContentId = tableInfo.some((row) => this._asString(row.name) === 'entryContentId'); + const hasEntryContentOffset = tableInfo.some((row) => this._asString(row.name) === 'entryContentOffset'); + const hasEntryContentLength = tableInfo.some((row) => this._asString(row.name) === 'entryContentLength'); + const hasEntryContentDictName = tableInfo.some((row) => this._asString(row.name) === 'entryContentDictName'); + if (!hasEntryContentId) { db.exec('ALTER TABLE terms ADD COLUMN entryContentId INTEGER'); } + if (!hasEntryContentOffset) { db.exec('ALTER TABLE terms ADD COLUMN entryContentOffset INTEGER'); } + if (!hasEntryContentLength) { db.exec('ALTER TABLE terms ADD COLUMN entryContentLength INTEGER'); } + if (!hasEntryContentDictName) { db.exec('ALTER TABLE terms ADD COLUMN entryContentDictName TEXT'); } + + db.exec(` + INSERT INTO termEntryContent(contentHash, rules, definitionTags, termTags, glossaryJson) + SELECT + '', + COALESCE(t.rules, ''), + COALESCE(t.definitionTags, ''), + COALESCE(t.termTags, ''), + COALESCE(t.glossaryJson, '[]') + FROM terms t + WHERE t.entryContentId IS NULL + `); + + const contentRows = db.selectObjects('SELECT id, rules, definitionTags, termTags, glossaryJson FROM termEntryContent WHERE contentHash = \'\''); + for (const row of contentRows) { + const id = this._asNumber(row.id, -1); + if (id <= 0) { continue; } + const rules = this._asString(row.rules); + const definitionTags = this._asString(row.definitionTags); + const termTags = this._asString(row.termTags); + const glossaryJson = this._asString(row.glossaryJson); + const contentHash = this._hashEntryContent(this._serializeTermEntryContent( + rules, + 
definitionTags, + termTags, + this._safeParseJson(glossaryJson, []), + )); + db.exec({ + sql: 'UPDATE termEntryContent SET contentHash = $contentHash WHERE id = $id', + bind: {$contentHash: contentHash, $id: id}, + }); + } + + db.exec(` + UPDATE terms + SET entryContentId = ( + SELECT c.id + FROM termEntryContent c + WHERE + c.rules = COALESCE(terms.rules, '') AND + c.definitionTags = COALESCE(terms.definitionTags, '') AND + c.termTags = COALESCE(terms.termTags, '') AND + c.glossaryJson = COALESCE(terms.glossaryJson, '[]') + LIMIT 1 + ) + WHERE entryContentId IS NULL + `); + + const externalizeRows = db.selectObjects(` + SELECT id, contentZstd, contentDictName, rules, definitionTags, termTags, glossaryJson + FROM termEntryContent + WHERE + (contentOffset IS NULL OR contentOffset < 0 OR contentLength IS NULL OR contentLength <= 0) + `); + if (externalizeRows.length > 0) { + const chunks = []; + /** @type {{id: number, contentDictName: string|null}[]} */ + const externalizedRows = []; + for (const row of externalizeRows) { + const id = this._asNumber(row.id, -1); + if (id <= 0) { continue; } + + let contentBytes = this._toUint8Array(row.contentZstd); + let contentDictName = this._asNullableString(row.contentDictName); + if (contentBytes === null || contentBytes.byteLength <= 0) { + const glossaryJson = this._asString(row.glossaryJson); + contentBytes = encodeRawTermContentBinary( + this._asString(row.rules), + this._asString(row.definitionTags), + this._asString(row.termTags), + this._textEncoder.encode(glossaryJson), + this._textEncoder, + ); + contentDictName = RAW_TERM_CONTENT_DICT_NAME; + } + chunks.push(contentBytes); + externalizedRows.push({id, contentDictName}); + } + if (chunks.length > 0) { + const spans = await this._termContentStore.appendBatch(chunks); + let spanIndex = 0; + for (const {id, contentDictName} of externalizedRows) { + const span = spans[spanIndex++]; + db.exec({ + sql: ` + UPDATE termEntryContent + SET + contentOffset = $contentOffset, + 
contentLength = $contentLength, + contentDictName = $contentDictName, + contentZstd = NULL + WHERE id = $id + `, + bind: { + $contentOffset: span.offset, + $contentLength: span.length, + $contentDictName: contentDictName, + $id: id, + }, + }); + } + } + } + + db.exec(` + UPDATE terms + SET + entryContentOffset = ( + SELECT c.contentOffset + FROM termEntryContent c + WHERE c.id = terms.entryContentId + LIMIT 1 + ), + entryContentLength = ( + SELECT c.contentLength + FROM termEntryContent c + WHERE c.id = terms.entryContentId + LIMIT 1 + ), + entryContentDictName = ( + SELECT c.contentDictName + FROM termEntryContent c + WHERE c.id = terms.entryContentId + LIMIT 1 + ) + WHERE + entryContentId IS NOT NULL AND + (entryContentOffset IS NULL OR entryContentOffset < 0 OR entryContentLength IS NULL OR entryContentLength <= 0) + `); + } + + /** + * Best effort cleanup for old IndexedDB storage from pre-sqlite builds. + */ + async _deleteLegacyIndexedDb() { + if (typeof indexedDB === 'undefined') { + return; + } + await new Promise((resolve) => { + try { + const request = indexedDB.deleteDatabase('dict'); + request.onsuccess = () => resolve(void 0); + request.onerror = () => resolve(void 0); + request.onblocked = () => resolve(void 0); + } catch (_) { + resolve(void 0); + } + }); + } + + /** + * @returns {import('@sqlite.org/sqlite-wasm').Database} + * @throws {Error} + */ + _requireDb() { + if (this._db === null) { + throw new Error(this._isOpening ? 
'Database not ready' : 'Database not open'); + } + return this._db; + } + + /** + * @returns {import('@sqlite.org/sqlite-wasm').Sqlite3Static} + * @throws {Error} + */ + _requireSqlite3() { + if (this._sqlite3 === null) { + throw new Error('sqlite3 module is not initialized'); + } + return this._sqlite3; + } + + /** + * @template {import('dictionary-database').ObjectStoreName} T + * @param {T} objectStoreName + * @returns {InsertStatement} + * @throws {Error} + */ + _getInsertStatement(objectStoreName) { + switch (objectStoreName) { + case 'dictionaries': + return { + sql: 'INSERT INTO dictionaries(title, version, summaryJson) VALUES($title, $version, $summaryJson)', + bind: (item) => { + const summary = /** @type {import('dictionary-importer').Summary} */ (item); + return { + $title: summary.title, + $version: summary.version, + $summaryJson: JSON.stringify(summary), + }; + }, + }; + case 'terms': + throw new Error('terms uses external virtual storage; use bulkAdd'); + case 'termMeta': + return { + sql: 'INSERT INTO termMeta(dictionary, expression, mode, dataJson) VALUES($dictionary, $expression, $mode, $dataJson)', + bind: (item) => { + const row = /** @type {import('dictionary-database').DatabaseTermMeta} */ (item); + return { + $dictionary: row.dictionary, + $expression: row.expression, + $mode: row.mode, + $dataJson: JSON.stringify(row.data), + }; + }, + }; + case 'kanji': + return { + sql: 'INSERT INTO kanji(dictionary, character, onyomi, kunyomi, tags, meaningsJson, statsJson) VALUES($dictionary, $character, $onyomi, $kunyomi, $tags, $meaningsJson, $statsJson)', + bind: (item) => { + const row = /** @type {import('dictionary-database').DatabaseKanjiEntry} */ (item); + return { + $dictionary: row.dictionary, + $character: row.character, + $onyomi: row.onyomi, + $kunyomi: row.kunyomi, + $tags: row.tags, + $meaningsJson: JSON.stringify(row.meanings), + $statsJson: row.stats ? 
JSON.stringify(row.stats) : null, + }; + }, + }; + case 'kanjiMeta': + return { + sql: 'INSERT INTO kanjiMeta(dictionary, character, mode, dataJson) VALUES($dictionary, $character, $mode, $dataJson)', + bind: (item) => { + const row = /** @type {import('dictionary-database').DatabaseKanjiMeta} */ (item); + return { + $dictionary: row.dictionary, + $character: row.character, + $mode: row.mode, + $dataJson: JSON.stringify(row.data), + }; + }, + }; + case 'tagMeta': + return { + sql: 'INSERT INTO tagMeta(dictionary, name, category, ord, notes, score) VALUES($dictionary, $name, $category, $ord, $notes, $score)', + bind: (item) => { + const row = /** @type {import('dictionary-database').Tag} */ (item); + return { + $dictionary: row.dictionary, + $name: row.name, + $category: row.category, + $ord: row.order, + $notes: row.notes, + $score: row.score, + }; + }, + }; + case 'media': + return { + sql: 'INSERT INTO media(dictionary, path, mediaType, width, height, content) VALUES($dictionary, $path, $mediaType, $width, $height, $content)', + bind: (item) => { + const row = /** @type {import('dictionary-database').MediaDataArrayBufferContent} */ (item); + return { + $dictionary: row.dictionary, + $path: row.path, + $mediaType: row.mediaType, + $width: row.width, + $height: row.height, + $content: row.content, + }; + }, + }; + default: + throw new Error(`Unsupported object store: ${objectStoreName}`); + } + } + + /** */ + _clearTermsVtabCursorState() { + this._termsVtabCursorState.clear(); + } + + /** + * @throws {Error} + */ + _registerTermsVirtualTableModule() { + if (this._termsVtabModuleRegistered) { + return; + } + const sqlite3 = this._requireSqlite3(); + const db = this._requireDb(); + const dbPointer = db.pointer; + if (typeof dbPointer !== 'number') { + throw new Error('sqlite database pointer is unavailable'); + } + if (typeof sqlite3.vtab === 'undefined') { + throw new Error('sqlite vtab API is unavailable'); + } + const {capi, vtab} = sqlite3; + const 
termsVtabIdxDictionaryEq = 1 << 0; + const termsVtabIdxExpressionEq = 1 << 1; + const termsVtabIdxReadingEq = 1 << 2; + const termsVtabIdxSequenceEq = 1 << 3; + const termsVtabIdxRowIdEq = 1 << 4; + const termRecordStore = this._termRecordStore; + const termsVtabCursorState = this._termsVtabCursorState; + const asNumber = this._asNumber.bind(this); + const asString = this._asString.bind(this); + const eqOp = typeof capi.SQLITE_INDEX_CONSTRAINT_EQ === 'number' ? capi.SQLITE_INDEX_CONSTRAINT_EQ : 2; + const toPtr = (value) => this._asNumber(value, 0); + const schema = ` + CREATE TABLE x( + dictionary TEXT, + expression TEXT, + reading TEXT, + expressionReverse TEXT, + readingReverse TEXT, + entryContentId INTEGER, + entryContentOffset INTEGER, + entryContentLength INTEGER, + entryContentDictName TEXT, + definitionTags TEXT, + termTags TEXT, + rules TEXT, + score INTEGER, + glossaryJson TEXT, + sequence INTEGER + ) + `; + + // sqlite wasm vtab helpers expose dynamic struct wrappers that are not strongly typed in our jsdoc surface. 
+ // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment + const module = vtab.setupModule({ + catchExceptions: true, + methods: { + xCreate(pDb, _pAux, _argc, _argv, ppVtab) { + const rc = capi.sqlite3_declare_vtab(toPtr(pDb), schema); + if (rc !== 0) { return rc; } + vtab.xVtab.create(toPtr(ppVtab)); + return 0; + }, + xConnect(pDb, pAux, argc, argv, ppVtab) { + const rc = capi.sqlite3_declare_vtab(toPtr(pDb), schema); + if (rc !== 0) { return rc; } + vtab.xVtab.create(toPtr(ppVtab)); + return 0; + }, + xBestIndex(_pVtab, pIdxInfo) { + const idxInfo = vtab.xIndexInfo(toPtr(pIdxInfo)); + let argvIndex = 1; + let idxNum = 0; + for (let i = 0; i < idxInfo.$nConstraint; ++i) { + // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment + const constraint = idxInfo.nthConstraint(i); + if (!constraint || constraint.$usable === 0 || constraint.$op !== eqOp) { continue; } + const column = toPtr(constraint.$iColumn); + // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment + const usage = idxInfo.nthConstraintUsage(i); + if (!usage) { continue; } + switch (column) { + case -1: + idxNum |= termsVtabIdxRowIdEq; + break; + case 0: + idxNum |= termsVtabIdxDictionaryEq; + break; + case 1: + idxNum |= termsVtabIdxExpressionEq; + break; + case 2: + idxNum |= termsVtabIdxReadingEq; + break; + case 14: + idxNum |= termsVtabIdxSequenceEq; + break; + default: + continue; + } + usage.$argvIndex = argvIndex++; + usage.$omit = 1; + } + idxInfo.$idxNum = idxNum; + idxInfo.$estimatedRows = idxNum === 0 ? Math.max(1, termRecordStore.size) : 32; + idxInfo.$estimatedCost = idxNum === 0 ? 
Math.max(1, termRecordStore.size) : 32; + return 0; + }, + xDisconnect(pVtab) { + vtab.xVtab.dispose(toPtr(pVtab)); + return 0; + }, + xDestroy(pVtab) { + vtab.xVtab.dispose(toPtr(pVtab)); + return 0; + }, + xOpen(_pVtab, ppCursor) { + const cursor = vtab.xCursor.create(toPtr(ppCursor)); + termsVtabCursorState.set(cursor.pointer, {ids: [], index: 0}); + return 0; + }, + xClose(pCursor) { + const cursorPtr = toPtr(pCursor); + termsVtabCursorState.delete(cursorPtr); + vtab.xCursor.dispose(cursorPtr); + return 0; + }, + xFilter(pCursor, idxNum, _idxStr, argc, argv) { + const cursorPtr = toPtr(pCursor); + const state = termsVtabCursorState.get(cursorPtr); + if (typeof state === 'undefined') { return 0; } + const args = capi.sqlite3_values_to_js(toPtr(argc), toPtr(argv)); + let argIndex = 0; + let rowId = null; + let dictionary = null; + let expression = null; + let reading = null; + let sequence = null; + const idxBits = toPtr(idxNum); + if ((idxBits & termsVtabIdxRowIdEq) !== 0) { rowId = asNumber(args[argIndex++], -1); } + if ((idxBits & termsVtabIdxDictionaryEq) !== 0) { dictionary = asString(args[argIndex++]); } + if ((idxBits & termsVtabIdxExpressionEq) !== 0) { expression = asString(args[argIndex++]); } + if ((idxBits & termsVtabIdxReadingEq) !== 0) { reading = asString(args[argIndex++]); } + if ((idxBits & termsVtabIdxSequenceEq) !== 0) { sequence = asNumber(args[argIndex++], -1); } + + const baseIds = (typeof rowId === 'number' && rowId > 0) ? [rowId] : termRecordStore.getAllIds(); + const ids = []; + for (const id of baseIds) { + if (id <= 0) { continue; } + const record = termRecordStore.getById(id); + if (typeof record === 'undefined') { continue; } + if (dictionary !== null && record.dictionary !== dictionary) { continue; } + if (expression !== null && record.expression !== expression) { continue; } + if (reading !== null && record.reading !== reading) { continue; } + if (sequence !== null && (record.sequence ?? 
-1) !== sequence) { continue; } + ids.push(id); + } + state.ids = ids; + state.index = 0; + return 0; + }, + xNext(pCursor) { + const state = termsVtabCursorState.get(toPtr(pCursor)); + if (typeof state !== 'undefined') { + ++state.index; + } + return 0; + }, + xEof(pCursor) { + const state = termsVtabCursorState.get(toPtr(pCursor)); + return (typeof state === 'undefined' || state.index >= state.ids.length) ? 1 : 0; + }, + xColumn(pCursor, pContext, column) { + const state = termsVtabCursorState.get(toPtr(pCursor)); + if (typeof state === 'undefined' || state.index >= state.ids.length) { + capi.sqlite3_result_null(toPtr(pContext)); + return 0; + } + const id = state.ids[state.index]; + const record = termRecordStore.getById(id); + if (typeof record === 'undefined') { + capi.sqlite3_result_null(toPtr(pContext)); + return 0; + } + let value = null; + switch (toPtr(column)) { + case 0: value = record.dictionary; break; + case 1: value = record.expression; break; + case 2: value = record.reading; break; + case 3: value = record.expressionReverse; break; + case 4: value = record.readingReverse; break; + case 5: value = null; break; + case 6: value = record.entryContentOffset; break; + case 7: value = record.entryContentLength; break; + case 8: value = record.entryContentDictName; break; + case 9: value = ''; break; + case 10: value = ''; break; + case 11: value = ''; break; + case 12: value = record.score; break; + case 13: value = '[]'; break; + case 14: value = record.sequence; break; + default: value = null; break; + } + capi.sqlite3_result_js(toPtr(pContext), value); + return 0; + }, + xRowid(pCursor, ppRowId) { + const state = termsVtabCursorState.get(toPtr(pCursor)); + const id = (typeof state === 'undefined' || state.index >= state.ids.length) ? 
0 : state.ids[state.index]; + vtab.xRowid(toPtr(ppRowId), id); + return 0; + }, + xUpdate() { + return capi.SQLITE_READONLY; + }, + }, + }); + // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment + const typedModule = /** @type {import('@sqlite.org/sqlite-wasm').sqlite3_module} */ (module); + this._termsVtabModule = typedModule; + const rc = capi.sqlite3_create_module(dbPointer, 'yomitan_terms', typedModule, 0); + if (rc !== 0) { + throw new Error(`Failed to register yomitan_terms module: rc=${rc}`); + } + this._termsVtabModuleRegistered = true; + } + + /** + * @param {string} whereClause + * @returns {string} + */ + _createTermSelectSql(whereClause) { + return ` + SELECT + t.* + FROM terms t + WHERE ${whereClause} + `; + } + + /** + * @param {Iterable} ids + * @returns {Promise>} + */ + async _fetchTermRowsByIds(ids) { + await this._termContentStore.ensureLoadedForRead(); + /** @type {Map} */ + const rowsById = new Map(); + const recordsById = this._termRecordStore.getByIds(ids); + for (const [id, record] of recordsById) { + const row = await this._deserializeTermRow({ + id, + dictionary: record.dictionary, + expression: record.expression, + reading: record.reading, + expressionReverse: record.expressionReverse, + readingReverse: record.readingReverse, + entryContentId: null, + entryContentOffset: record.entryContentOffset, + entryContentLength: record.entryContentLength, + entryContentDictName: record.entryContentDictName, + definitionTags: '', + termTags: '', + rules: '', + score: record.score, + glossaryJson: '[]', + sequence: record.sequence, + }); + rowsById.set(id, row); + } + return rowsById; + } + + /** + * @param {import('core').SafeAny} row + * @returns {Promise} + */ + async _deserializeTermRow(row) { + const entryContentId = this._asNullableNumber(row.entryContentId); + const contentOffset = this._asNumber(row.entryContentOffset, -1); + const contentLength = this._asNumber(row.entryContentLength, -1); + const contentDictName = 
this._asNullableString(row.entryContentDictName) ?? ''; + const hasExternalContentSpan = contentOffset >= 0 && contentLength > 0; + const cacheKey = hasExternalContentSpan ? + `span:${contentOffset}:${contentLength}:${contentDictName}` : + (typeof entryContentId === 'number' && entryContentId > 0 ? `id:${entryContentId}` : ''); + /** @type {string|null} */ + let definitionTags; + /** @type {string|undefined} */ + let termTags; + /** @type {string} */ + let rules; + /** @type {import('dictionary-data').TermGlossary[]} */ + let glossary; + /** @type {(() => import('dictionary-data').TermGlossary[])|null} */ + let glossaryResolver = null; + + if (cacheKey.length > 0) { + let cached = this._getCachedTermEntryContent(cacheKey); + if (typeof cached === 'undefined') { + /** @type {Uint8Array|null} */ + let contentBytes = null; + if (contentOffset >= 0 && contentLength > 0) { + try { + contentBytes = await this._termContentStore.readSlice(contentOffset, contentLength); + } catch (e) { + logTermContentZstdError(e); + contentBytes = null; + } + } + if (contentBytes !== null && contentBytes.length > 0) { + try { + const rawSharedGlossaryHeader = ( + contentDictName === RAW_TERM_CONTENT_SHARED_GLOSSARY_DICT_NAME || + isRawTermContentSharedGlossaryBinary(contentBytes) + ) ? + decodeRawTermContentSharedGlossaryHeader(contentBytes, this._textDecoder) : + null; + if (rawSharedGlossaryHeader !== null) { + definitionTags = this._asNullableString(rawSharedGlossaryHeader.definitionTags) ?? null; + termTags = this._asNullableString(rawSharedGlossaryHeader.termTags); + rules = this._asString(rawSharedGlossaryHeader.rules); + const rawGlossaryJsonBytes = contentDictName === RAW_TERM_CONTENT_COMPRESSED_SHARED_GLOSSARY_DICT_NAME ? 
+ await this._readCompressedSharedGlossarySlice( + this._asString(row.dictionary), + rawSharedGlossaryHeader.glossaryOffset, + rawSharedGlossaryHeader.glossaryLength, + ) : + await this._termContentStore.readSlice( + rawSharedGlossaryHeader.glossaryOffset, + rawSharedGlossaryHeader.glossaryLength, + ); + const glossaryJson = this._textDecoder.decode(rawGlossaryJsonBytes); + glossary = this._safeParseJson(glossaryJson, []); + cached = { + definitionTags, + termTags, + rules, + glossaryJson, + glossary: Array.isArray(glossary) ? glossary : [], + }; + } else { + const rawContentHeader = ( + contentDictName === RAW_TERM_CONTENT_DICT_NAME || + isRawTermContentBinary(contentBytes) + ) ? + decodeRawTermContentHeader(contentBytes, this._textDecoder) : + null; + if (rawContentHeader !== null) { + definitionTags = this._asNullableString(rawContentHeader.definitionTags) ?? null; + termTags = this._asNullableString(rawContentHeader.termTags); + rules = this._asString(rawContentHeader.rules); + const rawGlossaryJsonBytes = getRawTermContentGlossaryJsonBytes( + contentBytes, + rawContentHeader.glossaryJsonOffset, + rawContentHeader.glossaryJsonLength, + ); + const glossaryJson = this._textDecoder.decode(rawGlossaryJsonBytes); + glossary = this._safeParseJson(glossaryJson, []); + cached = { + definitionTags, + termTags, + rules, + glossaryJson, + glossary: Array.isArray(glossary) ? glossary : [], + }; + } else { + const contentJson = (contentDictName === 'raw') ? + this._textDecoder.decode(contentBytes) : + this._textDecoder.decode(decompressTermContentZstd(contentBytes, contentDictName.length > 0 ? 
contentDictName : null)); + const parsedHeader = this._parseSerializedTermEntryContentHeader(contentJson); + if (parsedHeader !== null) { + definitionTags = parsedHeader.definitionTags; + termTags = parsedHeader.termTags; + rules = parsedHeader.rules; + glossary = this._safeParseJson(parsedHeader.glossaryJson, []); + cached = { + definitionTags, + termTags, + rules, + glossaryJson: parsedHeader.glossaryJson, + glossary: Array.isArray(glossary) ? glossary : [], + }; + } else { + const content = /** @type {{rules?: string, definitionTags?: string, termTags?: string, glossary?: import('dictionary-data').TermGlossary[]}} */ ( + this._safeParseJson(contentJson, {}) + ); + definitionTags = this._asNullableString(content.definitionTags) ?? null; + termTags = this._asNullableString(content.termTags); + rules = this._asString(content.rules); + glossary = Array.isArray(content.glossary) ? content.glossary : []; + cached = { + definitionTags, + termTags, + rules, + glossaryJson: JSON.stringify(glossary), + glossary, + }; + } + } + } + } catch (e) { + logTermContentZstdError(e); + definitionTags = null; + termTags = ''; + rules = ''; + glossary = []; + cached = { + definitionTags, + termTags, + rules, + glossaryJson: '[]', + glossary, + }; + } + } else { + definitionTags = null; + termTags = ''; + rules = ''; + glossary = []; + cached = { + definitionTags, + termTags, + rules, + glossaryJson: '[]', + glossary, + }; + } + this._setCachedTermEntryContent(cacheKey, cached); + } + definitionTags = cached.definitionTags; + termTags = cached.termTags; + rules = cached.rules; + if (Array.isArray(cached.glossary)) { + glossary = cached.glossary; + } else { + glossary = []; + glossaryResolver = () => this._resolveCachedTermEntryGlossary(cached); + } + } else { + definitionTags = this._asNullableString(row.definitionTags) ?? 
null; + termTags = this._asNullableString(row.termTags); + rules = this._asString(row.rules); + glossary = this._safeParseJson(this._asString(row.glossaryJson), []); + } + const termEntry = { + id: this._asNumber(row.id, -1), + expression: this._asString(row.expression), + reading: this._asString(row.reading), + expressionReverse: this._asNullableString(row.expressionReverse), + readingReverse: this._asNullableString(row.readingReverse), + definitionTags, + rules, + score: this._asNumber(row.score, 0), + glossary, + sequence: this._asNullableNumber(row.sequence), + termTags, + dictionary: this._asString(row.dictionary), + }; + if (glossaryResolver !== null) { + Object.defineProperty(termEntry, 'glossary', { + enumerable: true, + configurable: true, + get: () => { + const resolvedGlossary = glossaryResolver(); + Object.defineProperty(termEntry, 'glossary', { + enumerable: true, + configurable: true, + writable: true, + value: resolvedGlossary, + }); + return resolvedGlossary; + }, + set: (value) => { + Object.defineProperty(termEntry, 'glossary', { + enumerable: true, + configurable: true, + writable: true, + value: Array.isArray(value) ? value : [], + }); + }, + }); + } + return termEntry; + } + + /** + * @param {string} cacheKey + * @returns {{definitionTags: string|null, termTags: string|undefined, rules: string, glossaryJson?: string, glossary?: import('dictionary-data').TermGlossary[]}|undefined} + */ + _getCachedTermEntryContent(cacheKey) { + const cached = this._termEntryContentCache.get(cacheKey); + if (typeof cached === 'undefined') { + return void 0; + } + // Promote recently used entries. 
+ this._termEntryContentCache.delete(cacheKey); + this._termEntryContentCache.set(cacheKey, cached); + return cached; + } + + /** + * @param {string} cacheKey + * @param {{definitionTags: string|null, termTags: string|undefined, rules: string, glossaryJson?: string, glossary?: import('dictionary-data').TermGlossary[]}} value + */ + _setCachedTermEntryContent(cacheKey, value) { + if (this._termEntryContentCache.has(cacheKey)) { + this._termEntryContentCache.delete(cacheKey); + } + this._termEntryContentCache.set(cacheKey, value); + while (this._termEntryContentCache.size > this._termEntryContentCacheMaxEntries) { + const oldestKey = this._termEntryContentCache.keys().next().value; + if (typeof oldestKey !== 'string') { break; } + this._termEntryContentCache.delete(oldestKey); + } + } + + /** + * @param {string} cacheKey + * @param {boolean} present + */ + _setTermExactPresenceCached(cacheKey, present) { + if (this._termExactPresenceCache.has(cacheKey)) { + this._termExactPresenceCache.delete(cacheKey); + } + this._termExactPresenceCache.set(cacheKey, present); + while (this._termExactPresenceCache.size > this._termExactPresenceCacheMaxEntries) { + const oldestKey = this._termExactPresenceCache.keys().next().value; + if (typeof oldestKey !== 'string') { break; } + this._termExactPresenceCache.delete(oldestKey); + } + } + + /** + * @param {{glossaryJson?: string, glossary?: import('dictionary-data').TermGlossary[], definitionTags: string|null, termTags: string|undefined, rules: string}} cached + * @returns {import('dictionary-data').TermGlossary[]} + */ + _resolveCachedTermEntryGlossary(cached) { + if (Array.isArray(cached.glossary)) { + return cached.glossary; + } + const parsedGlossary = this._safeParseJson(typeof cached.glossaryJson === 'string' ? cached.glossaryJson : '[]', []); + cached.glossary = Array.isArray(parsedGlossary) ? 
parsedGlossary : []; + return cached.glossary; + } + + /** + * @param {string} value + * @param {number} startIndex + * @returns {{token: string, endIndex: number}|null} + */ + _readJsonStringToken(value, startIndex) { + if (startIndex < 0 || startIndex >= value.length || value[startIndex] !== '"') { + return null; + } + let i = startIndex + 1; + const ii = value.length; + while (i < ii) { + const c = value[i]; + if (c === '\\') { + i += 2; + continue; + } + if (c === '"') { + return { + token: value.slice(startIndex, i + 1), + endIndex: i + 1, + }; + } + ++i; + } + return null; + } + + /** + * @param {string} contentJson + * @returns {{rules: string, definitionTags: string|null, termTags: string|undefined, glossaryJson: string}|null} + */ + _parseSerializedTermEntryContentHeader(contentJson) { + const prefixRules = '{"rules":'; + const prefixDefinitionTags = ',"definitionTags":'; + const prefixTermTags = ',"termTags":'; + const prefixGlossary = ',"glossary":'; + if (!contentJson.startsWith(prefixRules) || !contentJson.endsWith('}')) { + return null; + } + + let index = prefixRules.length; + const rulesToken = this._readJsonStringToken(contentJson, index); + if (rulesToken === null) { return null; } + index = rulesToken.endIndex; + if (!contentJson.startsWith(prefixDefinitionTags, index)) { return null; } + index += prefixDefinitionTags.length; + + const definitionTagsToken = this._readJsonStringToken(contentJson, index); + if (definitionTagsToken === null) { return null; } + index = definitionTagsToken.endIndex; + if (!contentJson.startsWith(prefixTermTags, index)) { return null; } + index += prefixTermTags.length; + + const termTagsToken = this._readJsonStringToken(contentJson, index); + if (termTagsToken === null) { return null; } + index = termTagsToken.endIndex; + if (!contentJson.startsWith(prefixGlossary, index)) { return null; } + index += prefixGlossary.length; + if (index > contentJson.length - 1) { return null; } + + const glossaryJson = 
contentJson.slice(index, -1); + return { + rules: /** @type {string} */ (this._safeParseJson(rulesToken.token, '')), + definitionTags: this._asNullableString(this._safeParseJson(definitionTagsToken.token, '')) ?? null, + termTags: this._asNullableString(this._safeParseJson(termTagsToken.token, '')), + glossaryJson, + }; + } + + /** + * @param {import('core').SafeAny} row + * @returns {import('dictionary-database').DatabaseTermMeta} + * @throws {Error} + */ + _deserializeTermMetaRow(row) { + const expression = this._asString(row.expression); + const dictionary = this._asString(row.dictionary); + const mode = this._asString(row.mode); + const data = /** @type {unknown} */ (this._safeParseJson(this._asString(row.dataJson), null)); + switch (mode) { + case 'freq': + return { + expression, + mode: 'freq', + data: /** @type {import('dictionary-data').GenericFrequencyData | import('dictionary-data').TermMetaFrequencyDataWithReading} */ (data), + dictionary, + }; + case 'pitch': + return { + expression, + mode: 'pitch', + data: /** @type {import('dictionary-data').TermMetaPitchData} */ (data), + dictionary, + }; + case 'ipa': + return { + expression, + mode: 'ipa', + data: /** @type {import('dictionary-data').TermMetaPhoneticData} */ (data), + dictionary, + }; + default: + throw new Error(`Unknown mode: ${mode}`); + } + } + + /** + * @param {import('core').SafeAny} row + * @returns {import('dictionary-database').DatabaseKanjiEntry} + */ + _deserializeKanjiRow(row) { + return { + character: this._asString(row.character), + onyomi: this._asString(row.onyomi), + kunyomi: this._asString(row.kunyomi), + tags: this._asString(row.tags), + meanings: this._safeParseJson(this._asString(row.meaningsJson), []), + dictionary: this._asString(row.dictionary), + stats: this._safeParseJson(this._asNullableString(row.statsJson) ?? 
'{}', {}), + }; + } + + /** + * @param {import('core').SafeAny} row + * @returns {import('dictionary-database').DatabaseKanjiMeta} + * @throws {Error} + */ + _deserializeKanjiMetaRow(row) { + const character = this._asString(row.character); + const dictionary = this._asString(row.dictionary); + const mode = this._asString(row.mode); + const data = /** @type {unknown} */ (this._safeParseJson(this._asString(row.dataJson), null)); + if (mode !== 'freq') { + throw new Error(`Unknown mode: ${mode}`); + } + return { + character, + mode: 'freq', + data: /** @type {import('dictionary-data').GenericFrequencyData} */ (data), + dictionary, + }; } /** - * @param {import('dictionary-database').TermExactRequest[]} termList - * @param {import('dictionary-database').DictionarySet} dictionaries - * @returns {Promise} + * @param {import('core').SafeAny} row + * @returns {import('dictionary-database').Tag} */ - findTermsExactBulk(termList, dictionaries) { - /** @type {import('dictionary-database').FindPredicate} */ - const predicate = (row, item) => (row.reading === item.reading && dictionaries.has(row.dictionary)); - return this._findMultiBulk('terms', ['expression'], termList, this._createOnlyQuery3, predicate, this._createTermBind1); + _deserializeTagRow(row) { + return { + name: this._asString(row.name), + category: this._asString(row.category), + order: this._asNumber(row.order, 0), + notes: this._asString(row.notes), + score: this._asNumber(row.score, 0), + dictionary: this._asString(row.dictionary), + }; } /** - * @param {import('dictionary-database').DictionaryAndQueryRequest[]} items - * @returns {Promise} + * @param {import('core').SafeAny} row + * @returns {import('dictionary-database').MediaDataArrayBufferContent} */ - findTermsBySequenceBulk(items) { - /** @type {import('dictionary-database').FindPredicate} */ - const predicate = (row, item) => (row.dictionary === item.dictionary); - return this._findMultiBulk('terms', ['sequence'], items, this._createOnlyQuery2, 
predicate, this._createTermBind2); + _deserializeMediaRow(row) { + return { + dictionary: this._asString(row.dictionary), + path: this._asString(row.path), + mediaType: this._asString(row.mediaType), + width: this._asNumber(row.width, 0), + height: this._asNumber(row.height, 0), + content: this._toArrayBuffer(row.content), + }; } /** - * @param {string[]} termList - * @param {import('dictionary-database').DictionarySet} dictionaries - * @returns {Promise} + * @param {unknown} field + * @returns {string[]} */ - findTermMetaBulk(termList, dictionaries) { - /** @type {import('dictionary-database').FindPredicate} */ - const predicate = (row) => dictionaries.has(row.dictionary); - return this._findMultiBulk('termMeta', ['expression'], termList, this._createOnlyQuery1, predicate, this._createTermMetaBind); + _splitField(field) { + return typeof field === 'string' && field.length > 0 ? field.split(' ') : []; } /** - * @param {string[]} kanjiList - * @param {import('dictionary-database').DictionarySet} dictionaries - * @returns {Promise} + * @param {unknown} value + * @returns {ArrayBuffer} */ - findKanjiBulk(kanjiList, dictionaries) { - /** @type {import('dictionary-database').FindPredicate} */ - const predicate = (row) => dictionaries.has(row.dictionary); - return this._findMultiBulk('kanji', ['character'], kanjiList, this._createOnlyQuery1, predicate, this._createKanjiBind); + _toArrayBuffer(value) { + if (value instanceof ArrayBuffer) { + return value; + } + if (value instanceof Uint8Array) { + return value.buffer.slice(value.byteOffset, value.byteOffset + value.byteLength); + } + return new ArrayBuffer(0); } /** - * @param {string[]} kanjiList - * @param {import('dictionary-database').DictionarySet} dictionaries - * @returns {Promise} + * @param {unknown} value + * @returns {Uint8Array|null} */ - findKanjiMetaBulk(kanjiList, dictionaries) { - /** @type {import('dictionary-database').FindPredicate} */ - const predicate = (row) => dictionaries.has(row.dictionary); - 
return this._findMultiBulk('kanjiMeta', ['character'], kanjiList, this._createOnlyQuery1, predicate, this._createKanjiMetaBind); + _toUint8Array(value) { + if (value instanceof Uint8Array) { + return value; + } + if (value instanceof ArrayBuffer) { + return new Uint8Array(value); + } + return null; } /** - * @param {import('dictionary-database').DictionaryAndQueryRequest[]} items - * @returns {Promise<(import('dictionary-database').Tag|undefined)[]>} + * @param {Uint8Array} a + * @param {Uint8Array} b + * @returns {boolean} */ - findTagMetaBulk(items) { - /** @type {import('dictionary-database').FindPredicate} */ - const predicate = (row, item) => (row.dictionary === item.dictionary); - return this._findFirstBulk('tagMeta', 'name', items, this._createOnlyQuery2, predicate); + _areUint8ArraysEqual(a, b) { + if (a.byteLength !== b.byteLength) { + return false; + } + for (let i = 0, ii = a.byteLength; i < ii; ++i) { + if (a[i] !== b[i]) { + return false; + } + } + return true; } /** - * @param {string} name - * @param {string} dictionary - * @returns {Promise} + * @returns {number} */ - findTagForTitle(name, dictionary) { - const query = IDBKeyRange.only(name); - return this._db.find('tagMeta', 'name', query, (row) => (/** @type {import('dictionary-database').Tag} */ (row).dictionary === dictionary), null, null); + _computeStatementCacheMaxEntries() { + const memoryGiB = this._getApproximateDeviceMemoryGiB(); + return memoryGiB !== null && memoryGiB <= 4 ? 
LOW_MEMORY_STATEMENT_CACHE_MAX_ENTRIES : DEFAULT_STATEMENT_CACHE_MAX_ENTRIES; } /** - * @param {import('dictionary-database').MediaRequest[]} items - * @returns {Promise} + * @returns {number} */ - getMedia(items) { - /** @type {import('dictionary-database').FindPredicate} */ - const predicate = (row, item) => (row.dictionary === item.dictionary); - return this._findMultiBulk('media', ['path'], items, this._createOnlyQuery4, predicate, this._createMediaBind); + _computeTermExactPresenceCacheMaxEntries() { + const memoryGiB = this._getApproximateDeviceMemoryGiB(); + return memoryGiB !== null && memoryGiB <= 4 ? LOW_MEMORY_TERM_EXACT_PRESENCE_CACHE_MAX_ENTRIES : DEFAULT_TERM_EXACT_PRESENCE_CACHE_MAX_ENTRIES; } /** - * @param {import('dictionary-database').DrawMediaRequest[]} items - * @param {MessagePort} source + * @returns {number} */ - async drawMedia(items, source) { - if (this._worker !== null) { // if a worker is available, offload the work to it - this._worker.postMessage({action: 'drawMedia', params: {items}}, [source]); - return; + _computeDefaultTermBulkAddStagingMaxRows() { + const memoryGiB = this._getApproximateDeviceMemoryGiB(); + if (memoryGiB !== null && memoryGiB <= 4) { + return TERM_BULK_ADD_STAGING_MAX_ROWS; } - // otherwise, you are the worker, so do the work - safePerformance.mark('drawMedia:start'); - - // merge items with the same path to reduce the number of database queries. collects the canvases into a single array for each path. 
- /** @type {Map} */ - const groupedItems = new Map(); - for (const item of items) { - const {path, dictionary, canvasIndex, canvasWidth, canvasHeight, generation} = item; - const key = `${path}:::${dictionary}`; - if (!groupedItems.has(key)) { - groupedItems.set(key, {path, dictionary, canvasIndexes: [], canvasWidth, canvasHeight, generation}); - } - groupedItems.get(key)?.canvasIndexes.push(canvasIndex); + if (memoryGiB !== null && memoryGiB >= 8) { + return HIGH_MEMORY_TERM_BULK_ADD_STAGING_MAX_ROWS; } - const groupedItemsArray = [...groupedItems.values()]; - - /** @type {import('dictionary-database').FindPredicate} */ - const predicate = (row, item) => (row.dictionary === item.dictionary); - const results = await this._findMultiBulk('media', ['path'], groupedItemsArray, this._createOnlyQuery5, predicate, this._createDrawMediaBind); - - // move all svgs to front to have a hotter loop - results.sort((a, _b) => (a.mediaType === 'image/svg+xml' ? -1 : 1)); - - safePerformance.mark('drawMedia:draw:start'); - for (const m of results) { - if (m.mediaType === 'image/svg+xml') { - safePerformance.mark('drawMedia:draw:svg:start'); - /** @type {import('@resvg/resvg-wasm').ResvgRenderOptions} */ - const opts = { - fitTo: { - mode: 'width', - value: m.canvasWidth, - }, - font: { - fontBuffers: this._resvgFontBuffer !== null ? 
[this._resvgFontBuffer] : [], - }, - }; - const resvgJS = new Resvg(new Uint8Array(m.content), opts); - const render = resvgJS.render(); - source.postMessage({action: 'drawBufferToCanvases', params: {buffer: render.pixels.buffer, width: render.width, height: render.height, canvasIndexes: m.canvasIndexes, generation: m.generation}}, [render.pixels.buffer]); - safePerformance.mark('drawMedia:draw:svg:end'); - safePerformance.measure('drawMedia:draw:svg', 'drawMedia:draw:svg:start', 'drawMedia:draw:svg:end'); - } else { - safePerformance.mark('drawMedia:draw:raster:start'); + return DEFAULT_TERM_BULK_ADD_STAGING_MAX_ROWS; + } - // ImageDecoder is slightly faster than Blob/createImageBitmap, but - // 1) it is not available in Firefox <133 - // 2) it is available in Firefox >=133, but it's not possible to transfer VideoFrames cross-process - // - // So the second branch is a fallback for all versions of Firefox and doesn't use ImageDecoder at all - // The second branch can eventually be changed to use ImageDecoder when we are okay with dropping support for Firefox <133 - // The branches can be unified entirely when Firefox implements support for transferring VideoFrames cross-process in postMessage - if ('serviceWorker' in navigator) { // this is just a check for chrome, we don't actually use service worker functionality here - const imageDecoder = new ImageDecoder({type: m.mediaType, data: m.content}); - await imageDecoder.decode().then((decodedImageResult) => { - source.postMessage({action: 'drawDecodedImageToCanvases', params: {decodedImage: decodedImageResult.image, canvasIndexes: m.canvasIndexes, generation: m.generation}}, [decodedImageResult.image]); - }); - } else { - const image = new Blob([m.content], {type: m.mediaType}); - await createImageBitmap(image, {resizeWidth: m.canvasWidth, resizeHeight: m.canvasHeight, resizeQuality: 'high'}).then((decodedImage) => { - // we need to do a dumb hack where we convert this ImageBitmap to an ImageData by drawing it to a 
temporary canvas, because Firefox doesn't support transferring ImageBitmaps cross-process - const canvas = new OffscreenCanvas(decodedImage.width, decodedImage.height); - const ctx = canvas.getContext('2d'); - if (ctx !== null) { - ctx.drawImage(decodedImage, 0, 0); - const imageData = ctx.getImageData(0, 0, decodedImage.width, decodedImage.height); - source.postMessage({action: 'drawBufferToCanvases', params: {buffer: imageData.data.buffer, width: decodedImage.width, height: decodedImage.height, canvasIndexes: m.canvasIndexes, generation: m.generation}}, [imageData.data.buffer]); - } - }); - } - safePerformance.mark('drawMedia:draw:raster:end'); - safePerformance.measure('drawMedia:draw:raster', 'drawMedia:draw:raster:start', 'drawMedia:draw:raster:end'); + /** + * @returns {number|null} + */ + _getApproximateDeviceMemoryGiB() { + try { + const rawValue = /** @type {unknown} */ (Reflect.get(globalThis.navigator ?? {}, 'deviceMemory')); + if (typeof rawValue === 'number' && Number.isFinite(rawValue) && rawValue > 0) { + return rawValue; } + } catch (_) { + // NOP } - safePerformance.mark('drawMedia:draw:end'); - safePerformance.measure('drawMedia:draw', 'drawMedia:draw:start', 'drawMedia:draw:end'); - - safePerformance.mark('drawMedia:end'); - safePerformance.measure('drawMedia', 'drawMedia:start', 'drawMedia:end'); + return null; } /** - * @returns {Promise} + * @param {number[]} values + * @returns {number} */ - getDictionaryInfo() { - return new Promise((resolve, reject) => { - const transaction = this._db.transaction(['dictionaries'], 'readonly'); - const objectStore = transaction.objectStore('dictionaries'); - this._db.getAll(objectStore, null, resolve, reject, null); - }); + _average(values) { + if (values.length === 0) { return 0; } + let total = 0; + for (const value of values) { + total += value; + } + return total / values.length; } /** - * @param {string[]} dictionaryNames - * @param {boolean} getTotal - * @returns {Promise} + * @param {number[]} values 
+ * @returns {number} */ - getDictionaryCounts(dictionaryNames, getTotal) { - return new Promise((resolve, reject) => { - const targets = [ - ['kanji', 'dictionary'], - ['kanjiMeta', 'dictionary'], - ['terms', 'dictionary'], - ['termMeta', 'dictionary'], - ['tagMeta', 'dictionary'], - ['media', 'dictionary'], - ]; - const objectStoreNames = targets.map(([objectStoreName]) => objectStoreName); - const transaction = this._db.transaction(objectStoreNames, 'readonly'); - const databaseTargets = targets.map(([objectStoreName, indexName]) => { - const objectStore = transaction.objectStore(objectStoreName); - const index = objectStore.index(indexName); - return {objectStore, index}; - }); + _p95(values) { + if (values.length === 0) { return 0; } + const sorted = [...values].sort((a, b) => a - b); + const index = Math.min(sorted.length - 1, Math.max(0, Math.ceil(sorted.length * 0.95) - 1)); + return sorted[index]; + } - /** @type {import('database').CountTarget[]} */ - const countTargets = []; - if (getTotal) { - for (const {objectStore} of databaseTargets) { - countTargets.push([objectStore, void 0]); - } - } - for (const dictionaryName of dictionaryNames) { - const query = IDBKeyRange.only(dictionaryName); - for (const {index} of databaseTargets) { - countTargets.push([index, query]); - } - } - - /** - * @param {number[]} results - */ - const onCountComplete = (results) => { - const resultCount = results.length; - const targetCount = targets.length; - /** @type {import('dictionary-database').DictionaryCountGroup[]} */ - const counts = []; - for (let i = 0; i < resultCount; i += targetCount) { - /** @type {import('dictionary-database').DictionaryCountGroup} */ - const countGroup = {}; - for (let j = 0; j < targetCount; ++j) { - countGroup[targets[j][0]] = results[i + j]; - } - counts.push(countGroup); - } - const total = getTotal ? 
/** @type {import('dictionary-database').DictionaryCountGroup} */ (counts.shift()) : null; - resolve({total, counts}); - }; + /** + * @param {unknown} value + * @param {number} defaultValue + * @returns {number} + */ + _asNumber(value, defaultValue = 0) { + if (typeof value === 'number') { return value; } + if (typeof value === 'bigint') { return Number(value); } + if (typeof value === 'string' && value.length > 0) { + const parsed = Number(value); + return Number.isFinite(parsed) ? parsed : defaultValue; + } + return defaultValue; + } - this._db.bulkCount(countTargets, onCountComplete, reject); - }); + /** + * @param {unknown} value + * @returns {number|undefined} + */ + _asNullableNumber(value) { + if (value === null || typeof value === 'undefined') { + return void 0; + } + return this._asNumber(value, 0); } /** - * @param {string} title - * @returns {Promise} + * @param {unknown} value + * @returns {string} */ - async dictionaryExists(title) { - const query = IDBKeyRange.only(title); - const result = await this._db.find('dictionaries', 'title', query, null, null, void 0); - return typeof result !== 'undefined'; + _asString(value) { + if (typeof value === 'string') { + return value; + } + if (typeof value === 'number' || typeof value === 'bigint') { + return `${value}`; + } + return ''; } /** - * @template {import('dictionary-database').ObjectStoreName} T - * @param {T} objectStoreName - * @param {import('dictionary-database').ObjectStoreData[]} items - * @param {number} start - * @param {number} count - * @returns {Promise} + * @param {unknown} value + * @returns {string|undefined} */ - bulkAdd(objectStoreName, items, start, count) { - return this._db.bulkAdd(objectStoreName, items, start, count); + _asNullableString(value) { + if (value === null || typeof value === 'undefined') { + return void 0; + } + return this._asString(value); } /** - * @template {import('dictionary-database').ObjectStoreName} T - * @param {T} objectStoreName - * @param 
{import('dictionary-database').ObjectStoreData} item - * @returns {Promise>} + * @template [T=unknown] + * @param {string} value + * @param {T} fallback + * @returns {T} */ - addWithResult(objectStoreName, item) { - return this._db.addWithResult(objectStoreName, item); + _safeParseJson(value, fallback) { + try { + return /** @type {T} */ (parseJson(value)); + } catch (_) { + return fallback; + } } /** - * @template {import('dictionary-database').ObjectStoreName} T - * @param {T} objectStoreName - * @param {import('dictionary-database').DatabaseUpdateItem[]} items - * @param {number} start - * @param {number} count - * @returns {Promise} + * @param {string} rules + * @param {string} definitionTags + * @param {string} termTags + * @param {import('dictionary-data').TermGlossary[]} glossary + * @returns {string} */ - bulkUpdate(objectStoreName, items, start, count) { - return this._db.bulkUpdate(objectStoreName, items, start, count); + _serializeTermEntryContent(rules, definitionTags, termTags, glossary) { + return JSON.stringify({rules, definitionTags, termTags, glossary}); } - // Private + /** + * @param {import('dictionary-database').DatabaseTermEntry} row + * @returns {Uint8Array|null} + */ + _getRawTermContentBytesIfAvailable(row) { + const glossaryJsonBytes = row.termEntryContentRawGlossaryJsonBytes; + if (!(glossaryJsonBytes instanceof Uint8Array) || glossaryJsonBytes.byteLength === 0) { + return null; + } + const rules = row.rules ?? ''; + const definitionTags = row.definitionTags ?? row.tags ?? ''; + const termTags = row.termTags ?? 
''; + const contentBytes = encodeRawTermContentBinary(rules, definitionTags, termTags, glossaryJsonBytes, this._textEncoder); + row.termEntryContentBytes = contentBytes; + row.termEntryContentRawGlossaryJsonBytes = void 0; + return contentBytes; + } /** - * @template [TRow=unknown] - * @template [TItem=unknown] - * @template [TResult=unknown] - * @param {import('dictionary-database').ObjectStoreName} objectStoreName - * @param {string[]} indexNames - * @param {TItem[]} items - * @param {import('dictionary-database').CreateQuery} createQuery - * @param {import('dictionary-database').FindPredicate} predicate - * @param {import('dictionary-database').CreateResult} createResult - * @returns {Promise} - */ - _findMultiBulk(objectStoreName, indexNames, items, createQuery, predicate, createResult) { - safePerformance.mark('findMultiBulk:start'); - return new Promise((resolve, reject) => { - const itemCount = items.length; - const indexCount = indexNames.length; - /** @type {TResult[]} */ - const results = []; - if (itemCount === 0 || indexCount === 0) { - resolve(results); - safePerformance.mark('findMultiBulk:end'); - safePerformance.measure('findMultiBulk', 'findMultiBulk:start', 'findMultiBulk:end'); - return; - } + * @param {string} contentJson + * @returns {string} + */ + _hashEntryContent(contentJson) { + let h1 = 0x811c9dc5; + let h2 = 0x9e3779b9; + const bytes = this._textEncoder.encode(contentJson); + for (let i = 0, ii = bytes.length; i < ii; ++i) { + const code = bytes[i]; + h1 = Math.imul((h1 ^ code) >>> 0, 0x01000193); + h2 = Math.imul((h2 ^ code) >>> 0, 0x85ebca6b); + h2 = (h2 ^ (h2 >>> 13)) >>> 0; + } + if ((h1 | h2) === 0) { + h1 = 1; + } + return `${(h1 >>> 0).toString(16).padStart(8, '0')}${(h2 >>> 0).toString(16).padStart(8, '0')}`; + } - const transaction = this._db.transaction([objectStoreName], 'readonly'); - const objectStore = transaction.objectStore(objectStoreName); - const indexList = []; - for (const indexName of indexNames) { - 
indexList.push(objectStore.index(indexName)); - } - let completeCount = 0; - const requiredCompleteCount = itemCount * indexCount; - /** - * @param {TItem} item - * @returns {(rows: TRow[], data: import('dictionary-database').FindMultiBulkData) => void} - */ - const onGetAll = (item) => (rows, data) => { - if (typeof item === 'object' && item !== null && 'path' in item) { - safePerformance.mark(`findMultiBulk:onGetAll:${item.path}:end`); - safePerformance.measure(`findMultiBulk:onGetAll:${item.path}`, `findMultiBulk:onGetAll:${item.path}:start`, `findMultiBulk:onGetAll:${item.path}:end`); - } - for (const row of rows) { - if (predicate(row, data.item)) { - results.push(createResult(row, data)); + /** + * @param {Uint8Array[]} contentBytesList + * @param {string|null} compressionDictName + * @param {(string|null)[]} [contentDictNameOverrides] + * @returns {{storedChunks: Uint8Array[], contentDictNames: string[], entryToStoredChunkIndexes: number[]}} + */ + _createTermContentStorageChunks(contentBytesList, compressionDictName, contentDictNameOverrides = []) { + if (this._termContentStorageMode === TERM_CONTENT_STORAGE_MODE_RAW_BYTES) { + return { + storedChunks: contentBytesList, + contentDictNames: contentBytesList.map((contentBytes, index) => { + const override = contentDictNameOverrides[index]; + if (typeof override === 'string' && override.length > 0) { + return override; } - } - if (++completeCount >= requiredCompleteCount) { - resolve(results); - safePerformance.mark('findMultiBulk:end'); - safePerformance.measure('findMultiBulk', 'findMultiBulk:start', 'findMultiBulk:end'); - } + return isRawTermContentBinary(contentBytes) ? + RAW_TERM_CONTENT_DICT_NAME : + (isRawTermContentSharedGlossaryBinary(contentBytes) ? 
RAW_TERM_CONTENT_SHARED_GLOSSARY_DICT_NAME : 'raw'); + }), + entryToStoredChunkIndexes: contentBytesList.map((_, index) => index), }; - safePerformance.mark('findMultiBulk:getAll:start'); - for (let i = 0; i < itemCount; ++i) { - const item = items[i]; - const query = createQuery(item); - for (let j = 0; j < indexCount; ++j) { - /** @type {import('dictionary-database').FindMultiBulkData} */ - const data = {item, itemIndex: i, indexIndex: j}; - if (typeof item === 'object' && item !== null && 'path' in item) { - safePerformance.mark(`findMultiBulk:onGetAll:${item.path}:start`); - } - this._db.getAll(indexList[j], query, onGetAll(item), reject, data); + } + /** @type {Uint8Array[]} */ + const storedChunks = []; + /** @type {string[]} */ + const contentDictNames = []; + for (const contentBytes of contentBytesList) { + let storedBytes = contentBytes; + let effectiveDictName = 'raw'; + if (contentBytes.byteLength >= this._termContentCompressionMinBytes) { + const compressed = compressTermContentZstd(contentBytes, compressionDictName); + if (compressed.byteLength < contentBytes.byteLength) { + storedBytes = compressed; + effectiveDictName = compressionDictName ?? 
''; } } - safePerformance.mark('findMultiBulk:getAll:end'); - safePerformance.measure('findMultiBulk:getAll', 'findMultiBulk:getAll:start', 'findMultiBulk:getAll:end'); - }); + storedChunks.push(storedBytes); + contentDictNames.push(effectiveDictName); + } + return { + storedChunks, + contentDictNames, + entryToStoredChunkIndexes: storedChunks.map((_, index) => index), + }; } /** - * @template [TRow=unknown] - * @template [TItem=unknown] - * @param {import('dictionary-database').ObjectStoreName} objectStoreName - * @param {string} indexName - * @param {TItem[]} items - * @param {import('dictionary-database').CreateQuery} createQuery - * @param {import('dictionary-database').FindPredicate} predicate - * @returns {Promise<(TRow|undefined)[]>} - */ - _findFirstBulk(objectStoreName, indexName, items, createQuery, predicate) { - return new Promise((resolve, reject) => { - const itemCount = items.length; - /** @type {(TRow|undefined)[]} */ - const results = new Array(itemCount); - if (itemCount === 0) { - resolve(results); - return; - } - - const transaction = this._db.transaction([objectStoreName], 'readonly'); - const objectStore = transaction.objectStore(objectStoreName); - const index = objectStore.index(indexName); - let completeCount = 0; - /** - * @param {TRow|undefined} row - * @param {number} itemIndex - */ - const onFind = (row, itemIndex) => { - results[itemIndex] = row; - if (++completeCount >= itemCount) { - resolve(results); - } - }; - for (let i = 0; i < itemCount; ++i) { - const item = items[i]; - const query = createQuery(item); - this._db.findFirst(index, query, onFind, reject, i, predicate, item, void 0); - } - }); + * @param {number} rowCount + * @returns {number} + */ + _getTermBulkAddBatchSizeForCount(rowCount) { + const baseline = this._termBulkAddBatchSize; + if (!this._adaptiveTermBulkAddBatchSize) { + return baseline; + } + let candidate = baseline; + if (rowCount >= 300000) { + candidate = 75000; + } else if (rowCount >= 160000) { + candidate = 
75000; + } else if (rowCount >= 60000) { + candidate = 50000; + } else if (rowCount >= 20000) { + candidate = 37500; + } + return Math.max(1024, Math.min(100000, Math.max(baseline, candidate))); } /** - * @param {import('dictionary-database').MatchType} matchType - * @param {import('dictionary-database').DatabaseTermEntryWithId} row - * @param {import('dictionary-database').FindMultiBulkData} data - * @returns {import('dictionary-database').TermEntry} + * @param {Error} error + * @returns {boolean} */ - _createTermGeneric(matchType, row, data) { - const matchSourceIsTerm = (data.indexIndex === 0); - const matchSource = (matchSourceIsTerm ? 'term' : 'reading'); - if ((matchSourceIsTerm ? row.expression : row.reading) === data.item) { - matchType = 'exact'; - } - return this._createTerm(matchSource, matchType, row, data.itemIndex); + _isRetryableBeginImmediateError(error) { + return /SQLITE_BUSY|SQLITE_LOCKED|database is locked/i.test(error.message); } /** - * @param {import('dictionary-database').DatabaseTermEntryWithId} row - * @param {import('dictionary-database').FindMultiBulkData} data - * @returns {import('dictionary-database').TermEntry} + * @param {number} ms + * @returns {Promise} */ - _createTermExact(row, data) { - return this._createTerm('term', 'exact', row, data.itemIndex); + async _sleep(ms) { + if (ms <= 0) { return; } + await new Promise((resolve) => { + setTimeout(resolve, ms); + }); } /** - * @param {import('dictionary-database').DatabaseTermEntryWithId} row - * @param {import('dictionary-database').FindMultiBulkData} data - * @returns {import('dictionary-database').TermEntry} + * @param {import('@sqlite.org/sqlite-wasm').Database} db + * @returns {Promise} + * @throws {Error} */ - _createTermSequenceExact(row, data) { - return this._createTerm('sequence', 'exact', row, data.itemIndex); + async _beginImmediateTransaction(db) { + if (!this._retryBeginImmediateTransaction) { + db.exec('BEGIN IMMEDIATE'); + return; + } + const retryBackoffMs = [0, 8, 
16, 32, 64, 128]; + /** @type {Error|null} */ + let lastError = null; + for (let i = 0; i < retryBackoffMs.length; ++i) { + await this._sleep(retryBackoffMs[i]); + try { + db.exec('BEGIN IMMEDIATE'); + return; + } catch (e) { + const error = toError(e); + lastError = error; + if (!this._isRetryableBeginImmediateError(error) || i >= (retryBackoffMs.length - 1)) { + throw error; + } + } + } + if (lastError !== null) { + throw lastError; + } + throw new Error('BEGIN IMMEDIATE failed with unknown error'); + } + + /** */ + _applyRuntimePragmas() { + const db = this._requireDb(); + db.exec('PRAGMA journal_mode = WAL'); + db.exec('PRAGMA synchronous = NORMAL'); + db.exec('PRAGMA temp_store = MEMORY'); + db.exec('PRAGMA foreign_keys = OFF'); + db.exec('PRAGMA wal_autocheckpoint = 1000'); + db.exec('PRAGMA cache_size = -16384'); + db.exec('PRAGMA cache_spill = ON'); + db.exec('PRAGMA locking_mode = NORMAL'); + } + + /** */ + _applyImportPragmas() { + const db = this._requireDb(); + db.exec('PRAGMA journal_mode = WAL'); + db.exec('PRAGMA synchronous = NORMAL'); + db.exec('PRAGMA temp_store = MEMORY'); + db.exec('PRAGMA foreign_keys = OFF'); + db.exec('PRAGMA cache_size = -131072'); + db.exec('PRAGMA cache_spill = OFF'); + db.exec('PRAGMA wal_autocheckpoint = 0'); + // OPFS-backed sqlite handles can see generic I/O/CANTOPEN failures under + // contention when EXCLUSIVE mode is held for long imports. Keep NORMAL + // here so concurrent extension handles can continue to cooperate. 
+ db.exec('PRAGMA locking_mode = NORMAL'); } /** @@ -806,11 +5202,29 @@ export class DictionaryDatabase { _createTermMeta({expression: term, mode, data, dictionary}, {itemIndex: index}) { switch (mode) { case 'freq': - return {index, term, mode, data, dictionary}; + return { + index, + term, + mode: 'freq', + data: /** @type {import('dictionary-data').GenericFrequencyData | import('dictionary-data').TermMetaFrequencyDataWithReading} */ (data), + dictionary, + }; case 'pitch': - return {index, term, mode, data, dictionary}; + return { + index, + term, + mode: 'pitch', + data: /** @type {import('dictionary-data').TermMetaPitchData} */ (data), + dictionary, + }; case 'ipa': - return {index, term, mode, data, dictionary}; + return { + index, + term, + mode: 'ipa', + data: /** @type {import('dictionary-data').TermMetaPhoneticData} */ (data), + dictionary, + }; default: throw new Error(`Unknown mode: ${mode}`); } @@ -834,50 +5248,4 @@ export class DictionaryDatabase { const {dictionary, path, mediaType, width, height, content} = row; return {index, dictionary, path, mediaType, width, height, content}; } - - /** - * @param {import('dictionary-database').MediaDataArrayBufferContent} row - * @param {import('dictionary-database').FindMultiBulkData} data - * @returns {import('dictionary-database').DrawMedia} - */ - _createDrawMedia(row, {itemIndex: index, item: {canvasIndexes, canvasWidth, canvasHeight, generation}}) { - const {dictionary, path, mediaType, width, height, content} = row; - return {index, dictionary, path, mediaType, width, height, content, canvasIndexes, canvasWidth, canvasHeight, generation}; - } - - /** - * @param {unknown} field - * @returns {string[]} - */ - _splitField(field) { - return typeof field === 'string' && field.length > 0 ? 
field.split(' ') : []; - } - - // Parent-Worker API - - /** - * @param {MessagePort} port - */ - async connectToDatabaseWorker(port) { - if (this._worker !== null) { - // executes outside of worker - this._worker.postMessage({action: 'connectToDatabaseWorker'}, [port]); - return; - } - // executes inside worker - port.onmessage = (/** @type {MessageEvent} */event) => { - const {action, params} = event.data; - return invokeApiMapHandler(this._apiMap, action, params, [port], () => {}); - }; - port.onmessageerror = (event) => { - const error = new ExtensionError('DictionaryDatabase: Error receiving message from main thread'); - error.data = event; - log.error(error); - }; - } - - /** @type {import('dictionary-database').ApiHandler<'drawMedia'>} */ - _onDrawMedia(params, port) { - void this.drawMedia(params.requests, port); - } } diff --git a/ext/js/dictionary/dictionary-importer.js b/ext/js/dictionary/dictionary-importer.js index 4a37dd33b6..4a3b78b72b 100644 --- a/ext/js/dictionary/dictionary-importer.js +++ b/ext/js/dictionary/dictionary-importer.js @@ -16,28 +16,182 @@ * along with this program. If not, see . 
*/ -import * as ajvSchemas0 from '../../lib/validate-schemas.js'; import { BlobWriter as BlobWriter0, TextWriter as TextWriter0, Uint8ArrayReader as Uint8ArrayReader0, + Uint8ArrayWriter as Uint8ArrayWriter0, ZipReader as ZipReader0, configure, } from '../../lib/zip.js'; -import {ExtensionError} from '../core/extension-error.js'; import {parseJson} from '../core/json.js'; import {toError} from '../core/to-error.js'; import {stringReverse} from '../core/utilities.js'; import {getFileExtensionFromImageMediaType, getImageMediaTypeFromFileName} from '../media/media-util.js'; +import { + decodeRawTermContentBinary, + encodeRawTermContentBinary, + RAW_TERM_CONTENT_COMPRESSED_SHARED_GLOSSARY_DICT_NAME, + RAW_TERM_CONTENT_SHARED_GLOSSARY_DICT_NAME, + isRawTermContentSharedGlossaryBinary, + rebaseRawTermContentSharedGlossaryBinary, +} from './raw-term-content.js'; +import { + initializeTermContentZstd, + logTermContentZstdError, +} from './zstd-term-content.js'; +import {decompress as zstdDecompress} from '../../lib/zstd-wasm.js'; import {compareRevisions} from './dictionary-data-util.js'; +import {consumeLastTermBankWasmParseProfile, parseTermBankWithWasmChunks} from './term-bank-wasm-parser.js'; -const ajvSchemas = /** @type {import('dictionary-importer').CompiledSchemaValidators} */ (/** @type {unknown} */ (ajvSchemas0)); const BlobWriter = /** @type {typeof import('@zip.js/zip.js').BlobWriter} */ (/** @type {unknown} */ (BlobWriter0)); const TextWriter = /** @type {typeof import('@zip.js/zip.js').TextWriter} */ (/** @type {unknown} */ (TextWriter0)); const Uint8ArrayReader = /** @type {typeof import('@zip.js/zip.js').Uint8ArrayReader} */ (/** @type {unknown} */ (Uint8ArrayReader0)); +const Uint8ArrayWriter = /** @type {typeof import('@zip.js/zip.js').Uint8ArrayWriter} */ (/** @type {unknown} */ (Uint8ArrayWriter0)); const ZipReader = /** @type {typeof import('@zip.js/zip.js').ZipReader} */ (/** @type {unknown} */ (ZipReader0)); const INDEX_FILE_NAME = 'index.json'; 
+const SUPPORTED_INDEX_VERSIONS = new Set([1, 3]); +const JSON_QUOTED_STRING_CACHE_MAX_ENTRIES = 8192; +const TERM_BANK_WASM_ROW_CHUNK_SIZE = 2048; +const TERM_BANK_WASM_INITIAL_META_CAPACITY_DIVISOR = 24; +const TERM_BANK_WASM_INITIAL_CONTENT_BYTES_PER_ROW = 96; +const TERM_ARTIFACT_ROW_CHUNK_SIZE = 6144; +const NO_MEDIA_FAST_PATH_TERM_BANK_WASM_ROW_CHUNK_SIZE = 8192; +const ADAPTIVE_TERM_BANK_WASM_ROW_CHUNK_SIZE_THRESHOLD_BYTES = 8 * 1024 * 1024; +const ADAPTIVE_TERM_BANK_WASM_ROW_CHUNK_SIZE_UPPER_BOUND_BYTES = 128 * 1024 * 1024; +const ADAPTIVE_TERM_BANK_WASM_INITIAL_META_CAPACITY_DIVISOR = 18; +const ADAPTIVE_TERM_BANK_WASM_INITIAL_CONTENT_BYTES_PER_ROW = 128; +const REVERSE_STRING_CACHE_MAX_ENTRIES = 4096; +const TERM_BANK_ARTIFACT_MAGIC = 'MBTB0001'; +const TERM_BANK_ARTIFACT_MAGIC_BYTES = TERM_BANK_ARTIFACT_MAGIC.length; +const TERM_BANK_ARTIFACT_MANIFEST_FILE = 'yomitan-import-artifact.json'; +const TERM_BANK_PACKED_ARTIFACT_FILE = 'yomitan-term-banks-packed.bin'; +const TERM_BANK_SHARED_GLOSSARY_ARTIFACT_FILE = 'yomitan-term-glossary-shared.bin'; +const TERM_ARTIFACT_PRELOAD_CONCURRENCY = 4; +const HEX_BYTE_TABLE = Array.from({length: 256}, (_, i) => i.toString(16).padStart(2, '0')); +/** @type {import('dictionary-data').TermGlossary[]} */ +const EMPTY_TERM_GLOSSARY = []; +Object.freeze(EMPTY_TERM_GLOSSARY); + +/** + * @param {string} value + * @returns {string} + */ +function reverseUtf16PreserveSurrogates(value) { + const ii = value.length; + if (ii <= 1) { + return value; + } + // Most dictionary terms are BMP-only; use a cheaper code-unit reversal when no surrogate code units are present. 
+ let hasSurrogates = false; + for (let i = 0; i < ii; ++i) { + if ((value.charCodeAt(i) & 0xf800) === 0xd800) { + hasSurrogates = true; + break; + } + } + if (!hasSurrogates) { + /** @type {string[]} */ + const parts = new Array(ii); + for (let i = 0; i < ii; ++i) { + parts[i] = value[ii - 1 - i]; + } + return parts.join(''); + } + /** @type {string[]} */ + const parts = []; + parts.length = ii; + let outIndex = 0; + for (let i = ii - 1; i >= 0; --i) { + const c = value.charCodeAt(i); + if ( + c >= 0xdc00 && c <= 0xdfff && + i > 0 + ) { + const prev = value.charCodeAt(i - 1); + if (prev >= 0xd800 && prev <= 0xdbff) { + parts[outIndex++] = value.slice(i - 1, i + 1); + --i; + continue; + } + } + parts[outIndex++] = value[i]; + } + parts.length = outIndex; + return parts.join(''); +} + +/** + * @param {number} h1 + * @param {number} h2 + * @returns {string} + */ +function hashPairToHex(h1, h2) { + const a = h1 >>> 0; + const b = h2 >>> 0; + return ( + HEX_BYTE_TABLE[(a >>> 24) & 0xff] + + HEX_BYTE_TABLE[(a >>> 16) & 0xff] + + HEX_BYTE_TABLE[(a >>> 8) & 0xff] + + HEX_BYTE_TABLE[a & 0xff] + + HEX_BYTE_TABLE[(b >>> 24) & 0xff] + + HEX_BYTE_TABLE[(b >>> 16) & 0xff] + + HEX_BYTE_TABLE[(b >>> 8) & 0xff] + + HEX_BYTE_TABLE[b & 0xff] + ); +} + +/** + * @param {{termEntryContentHash?: string, termEntryContentHash1?: number, termEntryContentHash2?: number, termEntryContentBytes?: Uint8Array}} entry + * @returns {boolean} + */ +function hasPrecomputedTermEntryContent(entry) { + return ( + ( + typeof entry.termEntryContentHash === 'string' && + entry.termEntryContentHash.length > 0 + ) || + ( + Number.isInteger(entry.termEntryContentHash1) && + Number.isInteger(entry.termEntryContentHash2) + ) + ) && entry.termEntryContentBytes instanceof Uint8Array; +} + +/** + * @typedef {object} ParsedTermBankChunkRow + * @property {string} expression + * @property {string} reading + * @property {string} definitionTags + * @property {string} rules + * @property {number} score + * @property 
{string} glossaryJson + * @property {Uint8Array} [glossaryJsonBytes] + * @property {boolean} [glossaryMayContainMedia] + * @property {number|null} sequence + * @property {string} termTags + * @property {string} [termEntryContentHash] + * @property {number} [termEntryContentHash1] + * @property {number} [termEntryContentHash2] + * @property {Uint8Array} termEntryContentBytes + */ + +/** + * @param {Uint8Array} bytes + * @param {number} aStart + * @param {number} bStart + * @param {number} length + * @returns {boolean} + */ +function byteRangeEqual(bytes, aStart, bStart, length) { + for (let i = 0; i < length; ++i) { + if (bytes[aStart + i] !== bytes[bStart + i]) { + return false; + } + } + return true; +} export class DictionaryImporter { /** @@ -51,6 +205,78 @@ export class DictionaryImporter { this._onProgress = typeof onProgress === 'function' ? onProgress : () => {}; /** @type {import('dictionary-importer').ProgressData} */ this._progressData = this._createProgressData(); + /** @type {number} */ + this._lastProgressTimestamp = 0; + /** @type {boolean} */ + this._skipImageMetadata = false; + /** @type {boolean} */ + this._skipMediaImport = false; + /** @type {number} */ + this._mediaResolutionConcurrency = 8; + /** @type {Map>} */ + this._pendingImageMediaByPath = new Map(); + /** @type {Map} */ + this._imageMetadataByPath = new Map(); + /** @type {boolean} */ + this._debugImportLogging = false; + /** @type {TextEncoder} */ + this._textEncoder = new TextEncoder(); + /** @type {TextDecoder} */ + this._textDecoder = new TextDecoder(); + /** @type {Map} */ + this._jsonQuotedStringCache = new Map(); + /** @type {Map} */ + this._utf8StringBytesCache = new Map(); + /** @type {number} */ + this._progressMinIntervalMs = 1000; + /** @type {boolean} */ + this._adaptiveTermBulkAddBatchSize = true; + /** @type {boolean} */ + this._glossaryMediaFastScan = false; + /** @type {boolean} */ + this._lazyGlossaryDecodeForMedia = false; + /** @type {boolean} */ + 
this._reuseExpressionReverseForReading = true; + /** @type {boolean} */ + this._disableTermBankWasmFastPath = false; + /** @type {boolean} */ + this._wasmCanonicalRowsFastPath = true; + /** @type {boolean} */ + this._wasmPassThroughTermContent = true; + /** @type {number} */ + this._termBankWasmRowChunkSize = TERM_BANK_WASM_ROW_CHUNK_SIZE; + /** @type {number} */ + this._termBankWasmInitialMetaCapacityDivisor = TERM_BANK_WASM_INITIAL_META_CAPACITY_DIVISOR; + /** @type {number} */ + this._termBankWasmInitialContentBytesPerRow = TERM_BANK_WASM_INITIAL_CONTENT_BYTES_PER_ROW; + /** @type {boolean} */ + this._adaptiveTermBankWasmRowChunkSize = false; + /** @type {boolean} */ + this._adaptiveTermBankWasmRowChunkSizeTiered = false; + /** @type {boolean} */ + this._adaptiveTermBankWasmInitialCapacity = false; + /** @type {boolean} */ + this._streamTermArtifactChunks = false; + /** @type {number} */ + this._termArtifactRowChunkSize = TERM_ARTIFACT_ROW_CHUNK_SIZE; + /** @type {boolean} */ + this._wasmSkipUnusedTermContentEncoding = true; + /** @type {boolean} */ + this._wasmReuseExpressionForReadingDecode = true; + /** @type {boolean} */ + this._wasmPreallocateChunkRows = false; + /** @type {boolean} */ + this._usePrecomputedContentForMediaRows = false; + /** @type {boolean} */ + this._leanCanonicalTermEntryObjects = false; + /** @type {boolean} */ + this._cacheReverseStrings = true; + /** @type {number} */ + this._reverseStringCacheMaxEntries = REVERSE_STRING_CACHE_MAX_ENTRIES; + /** @type {Map} */ + this._reverseStringCache = new Map(); + /** @type {boolean} */ + this._fastPrefixReverse = true; } /** @@ -69,32 +295,93 @@ export class DictionaryImporter { /** @type {Error[]} */ const errors = []; - const maxTransactionLength = 1000; + const maxTransactionLength = 262144; const bulkAddProgressAllowance = 1000; + const enableTermEntryContentDedup = details.enableTermEntryContentDedup !== false; + const termContentStorageMode = (details.termContentStorageMode === 'raw-bytes') 
? + details.termContentStorageMode : + 'baseline'; + this._skipImageMetadata = details.skipImageMetadata === true; + this._skipMediaImport = details.skipMediaImport === true; + this._mediaResolutionConcurrency = Math.max(1, Math.min(32, Math.trunc(details.mediaResolutionConcurrency ?? 8))); + this._debugImportLogging = details.debugImportLogging === true; + this._pendingImageMediaByPath.clear(); + this._imageMetadataByPath.clear(); + this._jsonQuotedStringCache.clear(); + this._utf8StringBytesCache.clear(); + this._reverseStringCache.clear(); + dictionaryDatabase.setImportOptimizationFlags({ + termContentStorageMode, + }); + const tImportStart = Date.now(); + /** @type {Array<{phase: string, elapsedMs: number, details?: Record}>} */ + const phaseTimings = []; + /** @type {{termParseMs: number, termSerializationMs: number, bulkAddTermsMs: number, bulkAddTagsMetaMs: number, mediaResolveMs: number, mediaWriteMs: number, termFileNonParseWriteMs: number, termMetaReadMs: number, kanjiReadMs: number, kanjiMetaReadMs: number, tagReadMs: number}} */ + const step4TimingBreakdown = { + termParseMs: 0, + termSerializationMs: 0, + bulkAddTermsMs: 0, + bulkAddTagsMetaMs: 0, + mediaResolveMs: 0, + mediaWriteMs: 0, + termFileNonParseWriteMs: 0, + termMetaReadMs: 0, + kanjiReadMs: 0, + kanjiMetaReadMs: 0, + tagReadMs: 0, + }; + /** @type {{parserProfile?: Record|null, materializationMs?: number, chunkSinkMs?: number, chunkCount?: number, totalRows?: number}|null} */ + let lastFastTermBankReadProfile = null; + /** @type {{readBytesMs?: number, decodeRowsMs?: number, reverseRowsMs?: number, metadataRebaseMs?: number, chunkSinkMs?: number, chunkCount?: number, totalRows?: number, rowChunkSize?: number}|null} */ + let lastArtifactTermBankReadProfile = null; + + /** + * @param {string} phase + * @param {number} startTime + * @param {Record} [phaseDetails] + */ + const recordPhaseTiming = (phase, startTime, phaseDetails = {}) => { + const elapsedMs = Math.max(0, Date.now() - startTime); 
+ const phaseTiming = {phase, elapsedMs, details: phaseDetails}; + phaseTimings.push(phaseTiming); + this._logImport(`phase ${phase} ${elapsedMs}ms details=${JSON.stringify(phaseDetails)}`); + }; /** * @template {import('dictionary-database').ObjectStoreName} T * @param {T} objectStoreName * @param {import('dictionary-database').ObjectStoreData[]} entries + * @param {{trackProgress?: boolean}} [options] */ - const bulkAdd = async (objectStoreName, entries) => { + const bulkAdd = async (objectStoreName, entries, {trackProgress = true} = {}) => { const entryCount = entries.length; + let progressIndexIncrease = 0; + if (trackProgress) { + progressIndexIncrease = bulkAddProgressAllowance / Math.ceil(entryCount / maxTransactionLength); + if (entryCount < maxTransactionLength) { progressIndexIncrease = bulkAddProgressAllowance; } + if (entryCount === 0) { + this._progressData.index += progressIndexIncrease; + } + } - let progressIndexIncrease = bulkAddProgressAllowance / Math.ceil(entryCount / maxTransactionLength); - if (entryCount < maxTransactionLength) { progressIndexIncrease = bulkAddProgressAllowance; } - if (entryCount === 0) { this._progressData.index += progressIndexIncrease; } - - for (let i = 0; i < entryCount; i += maxTransactionLength) { + for (let i = 0, chunkIndex = 0; i < entryCount; i += maxTransactionLength, ++chunkIndex) { const count = Math.min(maxTransactionLength, entryCount - i); + const tChunk = Date.now(); try { await dictionaryDatabase.bulkAdd(objectStoreName, entries, i, count); } catch (e) { - errors.push(toError(e)); + throw toError(e); } + this._logImport( + `bulkAdd ${objectStoreName} chunk=${chunkIndex + 1} ` + + `rows=${count} elapsed=${Date.now() - tChunk}ms`, + ); - this._progressData.index += progressIndexIncrease; - this._progress(); + if (trackProgress) { + this._progressData.index += progressIndexIncrease; + this._progress(); + } } }; @@ -108,8 +395,28 @@ export class DictionaryImporter { }); // Read archive - const fileMap = await 
this._getFilesFromArchive(archiveContent); - const index = await this._readAndValidateIndex(fileMap); + const tArchiveStart = Date.now(); + /** @type {import('dictionary-importer').ArchiveFileMap} */ + let fileMap; + /** @type {import('dictionary-data').Index} */ + let index; + try { + fileMap = await this._getFilesFromArchive(archiveContent); + index = await this._readAndValidateIndex(fileMap); + } catch (e) { + recordPhaseTiming('archive-and-index', tArchiveStart, {ok: false}); + return { + result: null, + errors: [toError(e)], + debug: {phaseTimings}, + }; + } + recordPhaseTiming('archive-and-index', tArchiveStart, { + ok: true, + fileCount: fileMap.size, + indexVersion: typeof index.version === 'number' ? index.version : null, + }); + this._logImport(`archive+index ${Date.now() - tArchiveStart}ms files=${fileMap.size}`); const dictionaryTitle = index.title; const version = /** @type {import('dictionary-data').IndexVersion} */ (index.version); @@ -119,37 +426,147 @@ export class DictionaryImporter { return { errors: [new Error(`Dictionary ${dictionaryTitle} is already imported, skipped it.`)], result: null, + debug: {phaseTimings}, }; } - - // Load schemas - this._progressNextStep(0); - const dataBankSchemas = this._getDataBankSchemas(version); + dictionaryDatabase.setTermEntryContentDedupEnabled(enableTermEntryContentDedup); + dictionaryDatabase.setImportDebugLogging(this._debugImportLogging); // Files /** @type {import('dictionary-importer').QueryDetails} */ const queryDetails = [ ['termFiles', /^term_bank_(\d+)\.json$/], + ['termArtifactFiles', /^term_bank_(\d+)\.mbtb$/], ['termMetaFiles', /^term_meta_bank_(\d+)\.json$/], ['kanjiFiles', /^kanji_bank_(\d+)\.json$/], ['kanjiMetaFiles', /^kanji_meta_bank_(\d+)\.json$/], ['tagFiles', /^tag_bank_(\d+)\.json$/], ]; - const {termFiles, termMetaFiles, kanjiFiles, kanjiMetaFiles, tagFiles} = Object.fromEntries(this._getArchiveFiles(fileMap, queryDetails)); + const {termFiles, termArtifactFiles, termMetaFiles, 
kanjiFiles, kanjiMetaFiles, tagFiles} = Object.fromEntries(this._getArchiveFiles(fileMap, queryDetails)); + const useTermArtifactFiles = termArtifactFiles.length > 0; + const termArtifactManifest = (useTermArtifactFiles || fileMap.has(TERM_BANK_ARTIFACT_MANIFEST_FILE)) ? + await this._readTermArtifactManifest(fileMap) : + null; + const effectiveTermContentStorageMode = ( + termArtifactManifest !== null && + ( + termArtifactManifest.termContentMode === RAW_TERM_CONTENT_SHARED_GLOSSARY_DICT_NAME || + termArtifactManifest.termContentMode === 'raw-v4' + ) + ) ? + 'raw-bytes' : + termContentStorageMode; + if (effectiveTermContentStorageMode !== termContentStorageMode) { + dictionaryDatabase.setImportOptimizationFlags({ + termContentStorageMode: effectiveTermContentStorageMode, + }); + } + const packedTermArtifactEntry = ( + termArtifactManifest !== null && + typeof termArtifactManifest.packedFileName === 'string' + ) ? + fileMap.get(termArtifactManifest.packedFileName) : + fileMap.get(TERM_BANK_PACKED_ARTIFACT_FILE); + const sharedGlossaryArtifactEntry = ( + termArtifactManifest !== null && + typeof termArtifactManifest.sharedGlossaryFileName === 'string' + ) ? + fileMap.get(termArtifactManifest.sharedGlossaryFileName) : + fileMap.get(TERM_BANK_SHARED_GLOSSARY_ARTIFACT_FILE); + const sharedGlossaryPackedOffset = termArtifactManifest?.sharedGlossaryPackedOffset ?? null; + const sharedGlossaryPackedLength = termArtifactManifest?.sharedGlossaryPackedLength ?? null; + const sharedGlossaryCompression = termArtifactManifest?.sharedGlossaryCompression ?? null; + const sharedGlossaryUncompressedLength = termArtifactManifest?.sharedGlossaryUncompressedLength ?? 
null; + /** @type {Uint8Array|null} */ + let packedTermArtifactBytes = null; + /** @type {Uint8Array|null} */ + let sharedGlossaryArtifactBytes = null; + /** @type {Map|null} */ + let preloadedTermArtifactBytes = null; + let packedTermArtifactPreloadMs = 0; + let termArtifactPreloadMs = 0; + let sharedGlossaryArtifactPreloadMs = 0; + if (useTermArtifactFiles || typeof packedTermArtifactEntry !== 'undefined') { + if (typeof packedTermArtifactEntry !== 'undefined') { + const tPackedArtifactReadStart = Date.now(); + packedTermArtifactBytes = await this._getData(/** @type {import('@zip.js/zip.js').Entry} */ (packedTermArtifactEntry), new Uint8ArrayWriter()); + packedTermArtifactPreloadMs = Math.max(0, Date.now() - tPackedArtifactReadStart); + this._logImport(`packed term artifact preload ${packedTermArtifactPreloadMs}ms bytes=${packedTermArtifactBytes.byteLength}`); + } else if (termArtifactFiles.length > 1) { + const tPreloadArtifactStart = Date.now(); + preloadedTermArtifactBytes = await this._preloadTermArtifactFiles(termArtifactFiles); + termArtifactPreloadMs = Math.max(0, Date.now() - tPreloadArtifactStart); + this._logImport(`term artifact preload ${termArtifactPreloadMs}ms files=${preloadedTermArtifactBytes.size}`); + } + } + if ( + packedTermArtifactBytes instanceof Uint8Array && + Number.isInteger(sharedGlossaryPackedOffset) && + Number.isInteger(sharedGlossaryPackedLength) + ) { + const packedSharedGlossaryOffset = /** @type {number} */ (sharedGlossaryPackedOffset); + const packedSharedGlossaryLength = /** @type {number} */ (sharedGlossaryPackedLength); + if ( + packedSharedGlossaryOffset >= 0 && + packedSharedGlossaryLength > 0 && + packedSharedGlossaryOffset + packedSharedGlossaryLength <= packedTermArtifactBytes.byteLength + ) { + sharedGlossaryArtifactBytes = packedTermArtifactBytes.subarray( + packedSharedGlossaryOffset, + packedSharedGlossaryOffset + packedSharedGlossaryLength, + ); + } + } + if (sharedGlossaryArtifactBytes === null && typeof 
sharedGlossaryArtifactEntry !== 'undefined') { + const tSharedGlossaryReadStart = Date.now(); + sharedGlossaryArtifactBytes = await this._getData(/** @type {import('@zip.js/zip.js').Entry} */ (sharedGlossaryArtifactEntry), new Uint8ArrayWriter()); + sharedGlossaryArtifactPreloadMs = Math.max(0, Date.now() - tSharedGlossaryReadStart); + this._logImport(`shared glossary artifact preload ${sharedGlossaryArtifactPreloadMs}ms bytes=${sharedGlossaryArtifactBytes.byteLength}`); + } + const useCompressedSharedGlossaryArtifact = termArtifactManifest?.termContentMode === RAW_TERM_CONTENT_COMPRESSED_SHARED_GLOSSARY_DICT_NAME; + if ( + sharedGlossaryArtifactBytes instanceof Uint8Array && + sharedGlossaryCompression === 'zstd' && + !useCompressedSharedGlossaryArtifact + ) { + try { + await initializeTermContentZstd(); + const defaultHeapSize = ( + Number.isInteger(sharedGlossaryUncompressedLength) && + /** @type {number} */ (sharedGlossaryUncompressedLength) > 0 + ) ? + /** @type {number} */ (sharedGlossaryUncompressedLength) : + (sharedGlossaryArtifactBytes.byteLength * 16); + sharedGlossaryArtifactBytes = zstdDecompress(sharedGlossaryArtifactBytes, {defaultHeapSize}); + } catch (e) { + logTermContentZstdError(e); + throw e; + } + } + const usePackedTermArtifact = ( + packedTermArtifactBytes !== null && + termArtifactManifest !== null && + termArtifactManifest.termBanksByArtifact.size > 0 + ); + /** @type {Array} */ + const activeTermFiles = usePackedTermArtifact ? + this._createPackedTermArtifactFiles(termArtifactManifest.termBanksByArtifact) : + (useTermArtifactFiles ? 
termArtifactFiles : termFiles); + this._logImport( + `banks terms=${activeTermFiles.length} termArtifacts=${termArtifactFiles.length} ` + + `termMeta=${termMetaFiles.length} kanji=${kanjiFiles.length} kanjiMeta=${kanjiMetaFiles.length} tags=${tagFiles.length} ` + + `useArtifactTerms=${String(useTermArtifactFiles || usePackedTermArtifact)} packedTermArtifact=${String(packedTermArtifactBytes !== null)} ` + + `preloadedTermArtifacts=${String(preloadedTermArtifactBytes !== null)}`, + ); - // Load data + // Load and import data const prefixWildcardsSupported = !!details.prefixWildcardsSupported; - this._progressNextStep(termFiles.length + termMetaFiles.length + kanjiFiles.length + kanjiMetaFiles.length + tagFiles.length); - - for (const termFile of termFiles) { await this._validateFile(termFile, dataBankSchemas[0]); } - for (const termMetaFile of termMetaFiles) { await this._validateFile(termMetaFile, dataBankSchemas[1]); } - for (const kanjiFile of kanjiFiles) { await this._validateFile(kanjiFile, dataBankSchemas[2]); } - for (const kanjiMetaFile of kanjiMetaFiles) { await this._validateFile(kanjiMetaFile, dataBankSchemas[3]); } - for (const tagFile of tagFiles) { await this._validateFile(tagFile, dataBankSchemas[4]); } - - // termFiles is doubled due to media importing - this._progressNextStep((termFiles.length * 2 + termMetaFiles.length + kanjiFiles.length + kanjiMetaFiles.length + tagFiles.length) * bulkAddProgressAllowance); + // Term files are doubled due to media importing. + // This transition enters "Importing data". 
+ this._progressNextStep((activeTermFiles.length * 2 + termMetaFiles.length + kanjiFiles.length + kanjiMetaFiles.length + tagFiles.length) * bulkAddProgressAllowance); + const previousProgressInterval = this._progressMinIntervalMs; + this._setProgressInterval(100); let importSuccess = false; @@ -168,67 +585,429 @@ export class DictionaryImporter { let summaryDetails = {prefixWildcardsSupported, counts, styles: '', yomitanVersion, importSuccess}; let summary = this._createSummary(dictionaryTitle, version, index, summaryDetails); - const dictionarySummaryAdd = await dictionaryDatabase.addWithResult('dictionaries', summary); - /** @type {Promise} */ - const dictionarySummaryResult = new Promise((resolve, reject) => { - dictionarySummaryAdd.onerror = () => reject(void 0); - dictionarySummaryAdd.onsuccess = () => resolve(dictionarySummaryAdd.result); - }); + const dictionarySummaryPrimaryKey = await dictionaryDatabase.addWithResult('dictionaries', summary); + let styles = ''; + let importFailed = false; + let sharedGlossaryArtifactBaseOffset = 0; + let sharedGlossaryArtifactAppendMs = 0; + const tImportBanksStart = Date.now(); try { - const uniqueMediaPaths = new Set(); - for (const termFile of termFiles) { - /** @type {import('dictionary-importer').ImportRequirement[]} */ - const requirements = []; - let termList = await ( - version === 1 ? - this._readFileSequence([termFile], this._convertTermBankEntryV1.bind(this), dictionaryTitle) : - this._readFileSequence([termFile], this._convertTermBankEntryV3.bind(this), dictionaryTitle) + await dictionaryDatabase.startBulkImport(); + if (sharedGlossaryArtifactBytes instanceof Uint8Array && sharedGlossaryArtifactBytes.byteLength > 0) { + const tSharedGlossaryAppendStart = Date.now(); + const sharedGlossarySpan = await dictionaryDatabase.appendRawSharedGlossaryArtifact( + dictionaryTitle, + sharedGlossaryArtifactBytes, + useCompressedSharedGlossaryArtifact ? 
RAW_TERM_CONTENT_COMPRESSED_SHARED_GLOSSARY_DICT_NAME : RAW_TERM_CONTENT_SHARED_GLOSSARY_DICT_NAME, + Number.isInteger(sharedGlossaryUncompressedLength) ? /** @type {number} */ (sharedGlossaryUncompressedLength) : sharedGlossaryArtifactBytes.byteLength, ); - - // Prefix wildcard support - if (prefixWildcardsSupported) { - for (const entry of termList) { - entry.expressionReverse = stringReverse(entry.expression); - entry.readingReverse = stringReverse(entry.reading); - } + if (!useCompressedSharedGlossaryArtifact) { + sharedGlossaryArtifactBaseOffset = sharedGlossarySpan.offset; + } + sharedGlossaryArtifactAppendMs = Math.max(0, Date.now() - tSharedGlossaryAppendStart); + } + const hasArchiveImageMediaFiles = this._archiveHasImageMediaFiles(fileMap); + const useMediaPipeline = ( + !this._skipMediaImport && + !useTermArtifactFiles && + hasArchiveImageMediaFiles + ); + this._logImport( + `media pipeline enabled=${String(useMediaPipeline)} skipMediaImport=${String(this._skipMediaImport)} ` + + `hasArchiveImageMediaFiles=${String(hasArchiveImageMediaFiles)}`, + ); + const uniqueMediaPaths = useMediaPipeline ? new Set() : null; + const termFileProgressAllowance = bulkAddProgressAllowance * 2; + const step4ArtifactPreloadMs = ( + packedTermArtifactPreloadMs + + termArtifactPreloadMs + + sharedGlossaryArtifactPreloadMs + ); + let step4ArtifactReadBytesMs = 0; + let step4ArtifactMetadataRebaseMs = 0; + let step4ArtifactMetadataAppendMs = 0; + const step4SharedGlossaryAppendMs = sharedGlossaryArtifactAppendMs; + /** + * @param {number} startIndex + * @param {number} processedRows + * @param {number} totalRows + * @param {boolean} [finalize] + */ + const updateStreamedTermFileProgress = (startIndex, processedRows, totalRows, finalize = false) => { + const normalizedTotalRows = Number.isFinite(totalRows) ? Math.max(1, Math.trunc(totalRows)) : 1; + const normalizedProcessedRows = finalize ? + normalizedTotalRows : + (Number.isFinite(processedRows) ? 
Math.max(0, Math.min(normalizedTotalRows, Math.trunc(processedRows))) : 0); + const target = Math.max( + this._progressData.index, + startIndex + Math.floor((normalizedProcessedRows / normalizedTotalRows) * termFileProgressAllowance), + ); + const upperBound = Math.min(this._progressData.count, startIndex + termFileProgressAllowance); + const clampedTarget = Math.min(target, upperBound); + if (clampedTarget > this._progressData.index) { + this._progressData.index = clampedTarget; + this._progress(); } + }; + /** + * @param {{filename: string}} termFile + * @param {import('dictionary-database').DatabaseTermEntry[]} termList + * @param {import('dictionary-importer').ImportRequirement[]|null} requirements + * @param {{processedRows: number, totalRows: number, chunkIndex: number, chunkCount: number}|null} streamedProgress + * @param {number} streamedProgressStartIndex + * @returns {Promise<{mediaResolveMs: number, mediaWriteMs: number, serializationMs: number, bulkAddTermsMs: number, contentAppendMs: number, termRecordBuildMs: number, termRecordEncodeMs: number, termRecordWriteMs: number, termsVtabInsertMs: number}>} + */ + const processTermChunk = async (termFile, termList, requirements, streamedProgress = null, streamedProgressStartIndex = 0) => { + const trackProgress = streamedProgress === null; + let mediaResolveMs = 0; + let mediaWriteMs = 0; + let serializationMs = 0; + let bulkAddTermsMs = 0; + let contentAppendMs = 0; + let termRecordBuildMs = 0; + let termRecordEncodeMs = 0; + let termRecordWriteMs = 0; + let termsVtabInsertMs = 0; + if (useMediaPipeline && requirements !== null && uniqueMediaPaths !== null) { + /** @type {import('dictionary-importer').ImportRequirement[]} */ + const alreadyAddedRequirements = []; + /** @type {import('dictionary-importer').ImportRequirement[]} */ + const notAddedRequirements = []; + for (const requirement of requirements) { + const mediaPath = requirement.source.path; + if (uniqueMediaPaths.has(mediaPath)) { + 
alreadyAddedRequirements.push(requirement); + continue; + } + uniqueMediaPaths.add(mediaPath); + notAddedRequirements.push(requirement); + } - // Extended data support - for (let i = 0, ii = termList.length; i < ii; ++i) { - const entry = termList[i]; - const glossaryList = entry.glossary; - for (let j = 0, jj = glossaryList.length; j < jj; ++j) { - const glossary = glossaryList[j]; - if (typeof glossary !== 'object' || glossary === null || Array.isArray(glossary)) { continue; } - glossaryList[j] = this._formatDictionaryTermGlossaryObject(glossary, entry, requirements); + const tMediaResolveStart = Date.now(); + const tResolveExisting = Date.now(); + if (alreadyAddedRequirements.length > 0) { + /** @type {import('dictionary-importer').ImportRequirement[]} */ + const unresolvedRequirements = []; + for (const requirement of alreadyAddedRequirements) { + if (!this._tryResolveRequirementFromCachedImageMetadata(requirement)) { + unresolvedRequirements.push(requirement); + } + } + if (unresolvedRequirements.length > 0) { + await this._resolveAsyncRequirements(unresolvedRequirements, fileMap); + } + } + const tResolveNew = Date.now(); + /** @type {import('dictionary-database').MediaDataArrayBufferContent[]} */ + let media = []; + if (notAddedRequirements.length > 0) { + ({media} = await this._resolveAsyncRequirements(notAddedRequirements, fileMap)); } + const tResolved = Date.now(); + mediaResolveMs += Math.max(0, tResolved - tMediaResolveStart); + step4TimingBreakdown.mediaResolveMs += mediaResolveMs; + this._logImport( + `term file ${termFile.filename}: resolve existing=${alreadyAddedRequirements.length} ` + + `${tResolveNew - tResolveExisting}ms new=${notAddedRequirements.length} ` + + `${tResolved - tResolveNew}ms`, + ); + this._logImport(`term file ${termFile.filename}: requirements=${requirements.length} newMedia=${media.length}`); + const tMediaWriteStart = Date.now(); + await bulkAdd('media', media, {trackProgress}); + const tMediaWriteEnd = Date.now(); + 
mediaWriteMs += Math.max(0, tMediaWriteEnd - tMediaWriteStart); + step4TimingBreakdown.mediaWriteMs += mediaWriteMs; + counts.media.total += media.length; + this._logImport(`term file ${termFile.filename}: media write rows=${media.length} elapsed=${tMediaWriteEnd - tMediaWriteStart}ms`); + + if (trackProgress) { this._progress(); } + media = []; } - const alreadyAddedRequirements = requirements.filter((x) => { return uniqueMediaPaths.has(x.source.path); }); - const notAddedRequirements = requirements.filter((x) => { return !uniqueMediaPaths.has(x.source.path); }); - for (const requirement of requirements) { uniqueMediaPaths.add(requirement.source.path); } - - await this._resolveAsyncRequirements(alreadyAddedRequirements, fileMap); // already added must also be resolved for the term dict to have correct data - let {media} = await this._resolveAsyncRequirements(notAddedRequirements, fileMap); - await bulkAdd('media', media); - counts.media.total += media.length; - - this._progress(); - - await bulkAdd('terms', termList); + if (useMediaPipeline) { + const tSerializationStart = Date.now(); + this._prepareTermImportSerialization(termList, enableTermEntryContentDedup); + serializationMs += Math.max(0, Date.now() - tSerializationStart); + step4TimingBreakdown.termSerializationMs += serializationMs; + } + const tTermsWriteStart = Date.now(); + await bulkAdd('terms', termList, {trackProgress}); + const tTermsWriteEnd = Date.now(); + bulkAddTermsMs += Math.max(0, tTermsWriteEnd - tTermsWriteStart); + const bulkAddTermsMetrics = dictionaryDatabase.getLastBulkAddTermsMetrics(); + if (bulkAddTermsMetrics !== null) { + ({ + contentAppendMs, + termRecordBuildMs, + termRecordEncodeMs, + termRecordWriteMs, + termsVtabInsertMs, + } = bulkAddTermsMetrics); + } + step4TimingBreakdown.bulkAddTermsMs += bulkAddTermsMs; counts.terms.total += termList.length; + this._logImport(`term file ${termFile.filename}: terms write rows=${termList.length} elapsed=${tTermsWriteEnd - 
tTermsWriteStart}ms`); - this._progress(); + if (trackProgress) { + this._progress(); + } else if (streamedProgress !== null) { + updateStreamedTermFileProgress(streamedProgressStartIndex, streamedProgress.processedRows, streamedProgress.totalRows); + } + return { + mediaResolveMs, + mediaWriteMs, + serializationMs, + bulkAddTermsMs, + contentAppendMs, + termRecordBuildMs, + termRecordEncodeMs, + termRecordWriteMs, + termsVtabInsertMs, + }; + }; + for (let termFileIndex = 0; termFileIndex < activeTermFiles.length; ++termFileIndex) { + const termFile = activeTermFiles[termFileIndex]; + const tTermFile = Date.now(); + const streamedProgressStartIndex = this._progressData.index; + let streamedImportCompleted = false; + let termParseAlreadyAccounted = false; + let streamChunkWorkMs = 0; + let artifactBulkAddTermsMs = 0; + let artifactSerializationMs = 0; + let artifactMediaResolveMs = 0; + let artifactMediaWriteMs = 0; + let artifactContentAppendMs = 0; + let artifactTermRecordBuildMs = 0; + let artifactTermRecordEncodeMs = 0; + let artifactTermRecordWriteMs = 0; + let artifactTermsVtabInsertMs = 0; + let artifactMetadataRebaseMs = 0; + const tFastParseStart = Date.now(); + if ((useTermArtifactFiles || usePackedTermArtifact) && /\.mbtb$/i.test(termFile.filename)) { + const termArtifactFileEntry = /** @type {import('@zip.js/zip.js').Entry|undefined} */ ( + termFile instanceof Object && 'getData' in termFile ? termFile : void 0 + ); + const tArtifactParseStart = Date.now(); + const packedTermBankMeta = termArtifactManifest?.termBanksByArtifact.get(termFile.filename) ?? 
null; + if (this._streamTermArtifactChunks) { + /** + * @param {import('dictionary-database').DatabaseTermEntry[]} termListChunk + * @param {import('dictionary-importer').ImportRequirement[]|null} requirementsChunk + * @param {{processedRows: number, totalRows: number, chunkIndex: number, chunkCount: number}} streamProgress + * @returns {Promise} + */ + const onArtifactChunk = async (termListChunk, requirementsChunk, streamProgress) => { + const tChunkWorkStart = Date.now(); + const chunkMetrics = await processTermChunk(termFile, termListChunk, requirementsChunk, streamProgress, streamedProgressStartIndex); + artifactBulkAddTermsMs += chunkMetrics.bulkAddTermsMs; + artifactSerializationMs += chunkMetrics.serializationMs; + artifactMediaResolveMs += chunkMetrics.mediaResolveMs; + artifactMediaWriteMs += chunkMetrics.mediaWriteMs; + artifactContentAppendMs += chunkMetrics.contentAppendMs; + step4ArtifactMetadataAppendMs += chunkMetrics.contentAppendMs; + artifactTermRecordBuildMs += chunkMetrics.termRecordBuildMs; + artifactTermRecordEncodeMs += chunkMetrics.termRecordEncodeMs; + artifactTermRecordWriteMs += chunkMetrics.termRecordWriteMs; + artifactTermsVtabInsertMs += chunkMetrics.termsVtabInsertMs; + streamChunkWorkMs += Math.max(0, Date.now() - tChunkWorkStart); + termListChunk.length = 0; + }; + if (packedTermArtifactBytes !== null && packedTermBankMeta !== null) { + const packedSlice = packedTermArtifactBytes.subarray( + packedTermBankMeta.packedOffset, + packedTermBankMeta.packedOffset + packedTermBankMeta.packedLength, + ); + await this._decodeTermBankArtifactBytes(packedSlice, termFile.filename, dictionaryTitle, prefixWildcardsSupported, effectiveTermContentStorageMode, onArtifactChunk, 0, sharedGlossaryArtifactBaseOffset); + } else if (preloadedTermArtifactBytes !== null) { + const preloadedBytes = preloadedTermArtifactBytes.get(termFile.filename); + if (typeof preloadedBytes === 'undefined') { + throw new Error(`Missing preloaded term artifact bytes for 
'${termFile.filename}'`); + } + await this._decodeTermBankArtifactBytes(preloadedBytes, termFile.filename, dictionaryTitle, prefixWildcardsSupported, effectiveTermContentStorageMode, onArtifactChunk, 0, sharedGlossaryArtifactBaseOffset); + } else { + if (typeof termArtifactFileEntry === 'undefined') { + throw new Error(`Missing zip entry for term artifact '${termFile.filename}'`); + } + await this._readTermBankArtifactFile( + termArtifactFileEntry, + dictionaryTitle, + prefixWildcardsSupported, + effectiveTermContentStorageMode, + onArtifactChunk, + sharedGlossaryArtifactBaseOffset, + ); + } + const totalArtifactReadMs = Math.max(0, Date.now() - tArtifactParseStart); + lastArtifactTermBankReadProfile = this._lastArtifactTermBankReadProfile ?? null; + if (lastArtifactTermBankReadProfile !== null) { + step4ArtifactReadBytesMs += Math.max(0, lastArtifactTermBankReadProfile.readBytesMs ?? 0); + step4ArtifactMetadataRebaseMs += Math.max(0, lastArtifactTermBankReadProfile.metadataRebaseMs ?? 0); + artifactMetadataRebaseMs += Math.max(0, lastArtifactTermBankReadProfile.metadataRebaseMs ?? 
0); + } + step4TimingBreakdown.termParseMs += Math.max(0, totalArtifactReadMs - streamChunkWorkMs); + termParseAlreadyAccounted = true; + } else { + let termReadResult; + if (packedTermArtifactBytes !== null && packedTermBankMeta !== null) { + const packedSlice = packedTermArtifactBytes.subarray( + packedTermBankMeta.packedOffset, + packedTermBankMeta.packedOffset + packedTermBankMeta.packedLength, + ); + termReadResult = await this._decodeTermBankArtifactBytes(packedSlice, termFile.filename, dictionaryTitle, prefixWildcardsSupported, effectiveTermContentStorageMode, void 0, 0, sharedGlossaryArtifactBaseOffset); + } else if (preloadedTermArtifactBytes !== null) { + const preloadedBytes = preloadedTermArtifactBytes.get(termFile.filename); + if (typeof preloadedBytes === 'undefined') { + throw new Error(`Missing preloaded term artifact bytes for '${termFile.filename}'`); + } + termReadResult = await this._decodeTermBankArtifactBytes(preloadedBytes, termFile.filename, dictionaryTitle, prefixWildcardsSupported, effectiveTermContentStorageMode, void 0, 0, sharedGlossaryArtifactBaseOffset); + } else { + if (typeof termArtifactFileEntry === 'undefined') { + throw new Error(`Missing zip entry for term artifact '${termFile.filename}'`); + } + termReadResult = await this._readTermBankArtifactFile( + termArtifactFileEntry, + dictionaryTitle, + prefixWildcardsSupported, + effectiveTermContentStorageMode, + void 0, + sharedGlossaryArtifactBaseOffset, + ); + } + lastArtifactTermBankReadProfile = this._lastArtifactTermBankReadProfile ?? null; + if (lastArtifactTermBankReadProfile !== null) { + step4ArtifactReadBytesMs += Math.max(0, lastArtifactTermBankReadProfile.readBytesMs ?? 0); + step4ArtifactMetadataRebaseMs += Math.max(0, lastArtifactTermBankReadProfile.metadataRebaseMs ?? 0); + artifactMetadataRebaseMs += Math.max(0, lastArtifactTermBankReadProfile.metadataRebaseMs ?? 
0); + } + step4TimingBreakdown.termParseMs += Math.max(0, Date.now() - tArtifactParseStart); + termParseAlreadyAccounted = true; + const chunkMetrics = await processTermChunk(termFile, termReadResult.termList, termReadResult.requirements); + artifactBulkAddTermsMs += chunkMetrics.bulkAddTermsMs; + artifactSerializationMs += chunkMetrics.serializationMs; + artifactMediaResolveMs += chunkMetrics.mediaResolveMs; + artifactMediaWriteMs += chunkMetrics.mediaWriteMs; + artifactContentAppendMs += chunkMetrics.contentAppendMs; + step4ArtifactMetadataAppendMs += chunkMetrics.contentAppendMs; + artifactTermRecordBuildMs += chunkMetrics.termRecordBuildMs; + artifactTermRecordEncodeMs += chunkMetrics.termRecordEncodeMs; + artifactTermRecordWriteMs += chunkMetrics.termRecordWriteMs; + artifactTermsVtabInsertMs += chunkMetrics.termsVtabInsertMs; + } + streamedImportCompleted = true; + } else if (!this._disableTermBankWasmFastPath) { + try { + const termFileEntry = /** @type {import('@zip.js/zip.js').Entry} */ (termFile); + await this._readTermBankFileFast( + termFileEntry, + version, + dictionaryTitle, + prefixWildcardsSupported, + useMediaPipeline, + enableTermEntryContentDedup, + termContentStorageMode, + async (termListChunk, requirementsChunk, streamProgress) => { + const tChunkWorkStart = Date.now(); + await processTermChunk(termFile, termListChunk, requirementsChunk, streamProgress, streamedProgressStartIndex); + streamChunkWorkMs += Math.max(0, Date.now() - tChunkWorkStart); + termListChunk.length = 0; + if (requirementsChunk !== null) { + requirementsChunk.length = 0; + } + }, + ); + lastFastTermBankReadProfile = this._lastFastTermBankReadProfile ?? 
null; + streamedImportCompleted = true; + } catch (error) { + const e = toError(error); + this._logImport(`term file ${termFile.filename}: streaming fast path failed (${e.message}), using fallback parser`); + } + } else { + this._logImport(`term file ${termFile.filename}: streaming wasm parser disabled by import flag`); + } + + if (streamedImportCompleted) { + if (!termParseAlreadyAccounted) { + const totalFastReadMs = Math.max(0, Date.now() - tFastParseStart); + step4TimingBreakdown.termParseMs += Math.max(0, totalFastReadMs - streamChunkWorkMs); + } + if (lastFastTermBankReadProfile !== null) { + const parserProfile = lastFastTermBankReadProfile.parserProfile ?? {}; + recordPhaseTiming(`term-file-fast-path:${termFile.filename}`, tFastParseStart, { + rows: lastFastTermBankReadProfile.totalRows ?? null, + chunkCount: lastFastTermBankReadProfile.chunkCount ?? null, + importerMaterializationMs: lastFastTermBankReadProfile.materializationMs ?? null, + importerChunkSinkMs: lastFastTermBankReadProfile.chunkSinkMs ?? null, + parserBufferSetupMs: parserProfile.bufferSetupMs ?? null, + parserAllocationMs: parserProfile.allocationMs ?? null, + parserCopyJsonMs: parserProfile.copyJsonMs ?? null, + parserParseBankMs: parserProfile.parseBankMs ?? null, + parserEncodeContentMs: parserProfile.encodeContentMs ?? null, + parserRowDecodeMs: parserProfile.rowDecodeMs ?? null, + parserChunkDispatchMs: parserProfile.chunkDispatchMs ?? null, + parserChunkSize: parserProfile.chunkSize ?? null, + parserMinimalDecode: parserProfile.minimalDecode ?? null, + }); + } else if (lastArtifactTermBankReadProfile !== null) { + const artifactReadBytesMs = lastArtifactTermBankReadProfile.readBytesMs ?? null; + recordPhaseTiming(`term-file-artifact-path:${termFile.filename}`, tFastParseStart, { + rows: lastArtifactTermBankReadProfile.totalRows ?? null, + chunkCount: lastArtifactTermBankReadProfile.chunkCount ?? null, + rowChunkSize: lastArtifactTermBankReadProfile.rowChunkSize ?? 
null, + artifactReadBytesMs, + artifactDecodeRowsMs: lastArtifactTermBankReadProfile.decodeRowsMs ?? null, + artifactReverseRowsMs: lastArtifactTermBankReadProfile.reverseRowsMs ?? null, + artifactBulkAddTermsMs, + artifactContentAppendMs, + artifactMetadataAppendMs: artifactContentAppendMs, + artifactTermRecordBuildMs, + artifactTermRecordEncodeMs, + artifactTermRecordWriteMs, + artifactTermsVtabInsertMs, + artifactMetadataRebaseMs, + artifactSerializationMs, + artifactMediaResolveMs, + artifactMediaWriteMs, + importerChunkSinkMs: lastArtifactTermBankReadProfile.chunkSinkMs ?? null, + }); + } + updateStreamedTermFileProgress(streamedProgressStartIndex, 1, 1, true); + } else { + const tTermParseStart = Date.now(); + const termFileEntry = /** @type {import('@zip.js/zip.js').Entry} */ (termFile); + const termReadResult = await this._readTermBankFile( + termFileEntry, + version, + dictionaryTitle, + prefixWildcardsSupported, + useMediaPipeline, + enableTermEntryContentDedup, + termContentStorageMode, + ); + step4TimingBreakdown.termParseMs += Math.max(0, Date.now() - tTermParseStart); + await processTermChunk(termFile, termReadResult.termList, termReadResult.requirements); + } - termList = []; - media = []; + const termFileElapsedMs = Math.max(0, Date.now() - tTermFile); + const termFileAccountedMs = ( + artifactBulkAddTermsMs + + artifactSerializationMs + + artifactMediaResolveMs + + artifactMediaWriteMs + + streamChunkWorkMs + ); + step4TimingBreakdown.termFileNonParseWriteMs += Math.max(0, termFileElapsedMs - termFileAccountedMs); + this._logImport(`term file ${termFile.filename}: total elapsed=${termFileElapsedMs}ms`); } for (const termMetaFile of termMetaFiles) { + const tTermMetaFile = Date.now(); let termMetaList = await this._readFileSequence([termMetaFile], this._convertTermMetaBankEntry.bind(this), dictionaryTitle); + step4TimingBreakdown.termMetaReadMs += Math.max(0, Date.now() - tTermMetaFile); + const tMetaWriteStart = Date.now(); await 
bulkAdd('termMeta', termMetaList); + step4TimingBreakdown.bulkAddTagsMetaMs += Math.max(0, Date.now() - tMetaWriteStart); for (const [key, value] of Object.entries(this._getMetaCounts(termMetaList))) { if (key in counts.termMeta) { counts.termMeta[key] += value; @@ -238,29 +1017,39 @@ export class DictionaryImporter { } this._progress(); + this._logImport(`termMeta file ${termMetaFile.filename}: entries=${termMetaList.length} elapsed=${Date.now() - tTermMetaFile}ms`); termMetaList = []; } for (const kanjiFile of kanjiFiles) { + const tKanjiFile = Date.now(); let kanjiList = await ( - version === 1 ? - this._readFileSequence([kanjiFile], this._convertKanjiBankEntryV1.bind(this), dictionaryTitle) : - this._readFileSequence([kanjiFile], this._convertKanjiBankEntryV3.bind(this), dictionaryTitle) + version === 1 ? + this._readFileSequence([kanjiFile], this._convertKanjiBankEntryV1.bind(this), dictionaryTitle) : + this._readFileSequence([kanjiFile], this._convertKanjiBankEntryV3.bind(this), dictionaryTitle) ); + step4TimingBreakdown.kanjiReadMs += Math.max(0, Date.now() - tKanjiFile); + const tKanjiWriteStart = Date.now(); await bulkAdd('kanji', kanjiList); + step4TimingBreakdown.bulkAddTagsMetaMs += Math.max(0, Date.now() - tKanjiWriteStart); counts.kanji.total += kanjiList.length; this._progress(); + this._logImport(`kanji file ${kanjiFile.filename}: entries=${kanjiList.length} elapsed=${Date.now() - tKanjiFile}ms`); kanjiList = []; } for (const kanjiMetaFile of kanjiMetaFiles) { + const tKanjiMetaFile = Date.now(); let kanjiMetaList = await this._readFileSequence([kanjiMetaFile], this._convertKanjiMetaBankEntry.bind(this), dictionaryTitle); + step4TimingBreakdown.kanjiMetaReadMs += Math.max(0, Date.now() - tKanjiMetaFile); + const tKanjiMetaWriteStart = Date.now(); await bulkAdd('kanjiMeta', kanjiMetaList); + step4TimingBreakdown.bulkAddTagsMetaMs += Math.max(0, Date.now() - tKanjiMetaWriteStart); for (const [key, value] of 
Object.entries(this._getMetaCounts(kanjiMetaList))) { if (key in counts.kanjiMeta) { counts.kanjiMeta[key] += value; @@ -270,52 +1059,143 @@ export class DictionaryImporter { } this._progress(); + this._logImport(`kanjiMeta file ${kanjiMetaFile.filename}: entries=${kanjiMetaList.length} elapsed=${Date.now() - tKanjiMetaFile}ms`); kanjiMetaList = []; } for (const tagFile of tagFiles) { + const tTagFile = Date.now(); let tagList = await this._readFileSequence([tagFile], this._convertTagBankEntry.bind(this), dictionaryTitle); this._addOldIndexTags(index, tagList, dictionaryTitle); + step4TimingBreakdown.tagReadMs += Math.max(0, Date.now() - tTagFile); + const tTagWriteStart = Date.now(); await bulkAdd('tagMeta', tagList); + step4TimingBreakdown.bulkAddTagsMetaMs += Math.max(0, Date.now() - tTagWriteStart); counts.tagMeta.total += tagList.length; this._progress(); + this._logImport(`tag file ${tagFile.filename}: entries=${tagList.length} elapsed=${Date.now() - tTagFile}ms`); tagList = []; } - - importSuccess = true; + const importDataBanksElapsedMs = Math.max(0, Date.now() - tImportBanksStart); + const step4AccountedMs = ( + step4TimingBreakdown.termParseMs + + step4TimingBreakdown.termSerializationMs + + step4TimingBreakdown.bulkAddTermsMs + + step4TimingBreakdown.bulkAddTagsMetaMs + + step4TimingBreakdown.mediaResolveMs + + step4TimingBreakdown.mediaWriteMs + ); + recordPhaseTiming('import-data-banks', tImportBanksStart, { + terms: counts.terms.total, + termMeta: counts.termMeta.total, + kanji: counts.kanji.total, + kanjiMeta: counts.kanjiMeta.total, + tagMeta: counts.tagMeta.total, + media: counts.media.total, + step4TermParseMs: Math.max(0, step4TimingBreakdown.termParseMs), + step4TermSerializationMs: Math.max(0, step4TimingBreakdown.termSerializationMs), + step4BulkAddTermsMs: Math.max(0, step4TimingBreakdown.bulkAddTermsMs), + step4BulkAddTagsMetaMs: Math.max(0, step4TimingBreakdown.bulkAddTagsMetaMs), + step4MediaResolveMs: Math.max(0, 
step4TimingBreakdown.mediaResolveMs), + step4MediaWriteMs: Math.max(0, step4TimingBreakdown.mediaWriteMs), + step4TermFileNonParseWriteMs: Math.max(0, step4TimingBreakdown.termFileNonParseWriteMs), + step4ArtifactPreloadMs: Math.max(0, step4ArtifactPreloadMs), + step4ArtifactReadBytesMs: Math.max(0, step4ArtifactReadBytesMs), + step4ArtifactMetadataRebaseMs: Math.max(0, step4ArtifactMetadataRebaseMs), + step4ArtifactMetadataAppendMs: Math.max(0, step4ArtifactMetadataAppendMs), + step4SharedGlossaryAppendMs: Math.max(0, step4SharedGlossaryAppendMs), + step4TermMetaReadMs: Math.max(0, step4TimingBreakdown.termMetaReadMs), + step4KanjiReadMs: Math.max(0, step4TimingBreakdown.kanjiReadMs), + step4KanjiMetaReadMs: Math.max(0, step4TimingBreakdown.kanjiMetaReadMs), + step4TagReadMs: Math.max(0, step4TimingBreakdown.tagReadMs), + step4AccountedMs: Math.max(0, step4AccountedMs), + step4OtherMs: Math.max(0, importDataBanksElapsedMs - step4AccountedMs), + useMediaPipeline, + hasArchiveImageMediaFiles, + }); + + // Finalize dictionary descriptor + this._progressNextStep(0); + const tFinalizeDescriptorStart = Date.now(); + + const stylesFileName = 'styles.css'; + const stylesFile = fileMap.get(stylesFileName); + if (typeof stylesFile !== 'undefined') { + styles = await this._getData(stylesFile, new TextWriter()); + const cssErrors = this._validateCss(styles); + if (cssErrors.length > 0) { + throw cssErrors[0]; + } + } + recordPhaseTiming('finalize-descriptor', tFinalizeDescriptorStart, { + hasStyles: styles.length > 0, + }); } catch (e) { + importFailed = true; errors.push(toError(e)); + recordPhaseTiming('import-data-banks', tImportBanksStart, { + ok: false, + }); + } finally { + this._setProgressInterval(previousProgressInterval); + const tBulkFinalizationStart = Date.now(); + /** @type {{commitMs?: number, termContentEndImportSessionMs?: number, termRecordEndImportSessionMs?: number, termsVirtualTableSyncMs?: number, createIndexesMs?: number, createIndexesCheckpointCount?: 
number, cacheResetMs?: number, runtimePragmasMs?: number, totalMs?: number}|null} */ + let bulkFinalizationDetails = null; + this._progressNextStep(20, false); + this._progressData.index = 0; + this._progress(); + try { + bulkFinalizationDetails = await dictionaryDatabase.finishBulkImport((checkpointIndex, total) => { + this._progressData.index = Math.max(1, Math.floor((checkpointIndex / total) * this._progressData.count)); + this._progress(); + this._logImport(`bulk finalization ${checkpointIndex}/${total}`); + }); + } catch (e) { + importFailed = true; + errors.push(toError(e)); + } + this._progressData.index = this._progressData.count; + this._progress(); + const bulkFinalizationPhaseDetails = {ok: !importFailed}; + if (bulkFinalizationDetails !== null) { + Object.assign(bulkFinalizationPhaseDetails, bulkFinalizationDetails); + } + recordPhaseTiming('bulk-finalization', tBulkFinalizationStart, bulkFinalizationPhaseDetails); + dictionaryDatabase.setImportDebugLogging(false); } - // Update dictionary descriptor - this._progressNextStep(0); - - const stylesFileName = 'styles.css'; - const stylesFile = fileMap.get(stylesFileName); - let styles = ''; - if (typeof stylesFile !== 'undefined') { - styles = await this._getData(stylesFile, new TextWriter()); - const cssErrors = this._validateCss(styles); - if (cssErrors.length > 0) { - return { - errors: cssErrors, - result: null, - }; + if (importFailed) { + try { + await dictionaryDatabase.deleteDictionary(dictionaryTitle, 1000, () => {}); + } catch (e) { + const cleanupError = toError(e); + errors.push(new Error(`Failed to clean up partially imported dictionary ${dictionaryTitle}: ${cleanupError.message}`)); } + return { + result: null, + errors, + debug: {phaseTimings}, + }; } + importSuccess = true; summaryDetails = {prefixWildcardsSupported, counts, styles, yomitanVersion, importSuccess}; summary = this._createSummary(dictionaryTitle, version, index, summaryDetails); - const primaryKey = await 
dictionarySummaryResult; - await dictionaryDatabase.bulkUpdate('dictionaries', [{data: summary, primaryKey}], 0, 1); + const tSummaryUpdateStart = Date.now(); + await dictionaryDatabase.bulkUpdate('dictionaries', [{data: summary, primaryKey: dictionarySummaryPrimaryKey}], 0, 1); + recordPhaseTiming('write-summary', tSummaryUpdateStart, {ok: true}); + this._logImport(`import done ${Date.now() - tImportStart}ms terms=${counts.terms.total} media=${counts.media.total}`); this._progress(); - - return {result: summary, errors}; + return { + result: summary, + errors, + debug: {phaseTimings}, + }; } /** @@ -368,11 +1248,6 @@ export class DictionaryImporter { const indexContent = await this._getData(indexFile2, new TextWriter()); const index = /** @type {unknown} */ (parseJson(indexContent)); - - if (!ajvSchemas.dictionaryIndex(index)) { - throw this._formatAjvSchemaError(ajvSchemas.dictionaryIndex, INDEX_FILE_NAME); - } - const validIndex = /** @type {import('dictionary-data').Index} */ (index); const version = typeof validIndex.format === 'number' ? 
validIndex.format : validIndex.version; @@ -382,10 +1257,26 @@ export class DictionaryImporter { if (typeof version !== 'number' || !title || !revision) { throw new Error('Unrecognized dictionary format'); } + if (!SUPPORTED_INDEX_VERSIONS.has(version)) { + throw new Error(`Unsupported dictionary format version: ${String(version)}`); + } return validIndex; } + /** + * @param {import('dictionary-importer').ArchiveFileMap} fileMap + * @returns {boolean} + */ + _archiveHasImageMediaFiles(fileMap) { + for (const fileName of fileMap.keys()) { + if (getImageMediaTypeFromFileName(fileName) !== null) { + return true; + } + } + return false; + } + /** * @returns {import('dictionary-importer').ProgressData} */ @@ -399,25 +1290,51 @@ export class DictionaryImporter { /** */ _progressReset() { this._progressData = this._createProgressData(); + this._lastProgressTimestamp = 0; this._progress(true); } /** * @param {number} count + * @param {boolean} [advanceStep] */ - _progressNextStep(count) { + _progressNextStep(count, advanceStep = true) { this._progressData.index = 0; this._progressData.count = count; - this._progress(true); + this._progress(advanceStep); + } + + /** + * @param {number} intervalMs + */ + _setProgressInterval(intervalMs) { + const normalizedIntervalMs = Number.isFinite(intervalMs) ? 
Math.max(50, Math.trunc(intervalMs)) : this._progressMinIntervalMs; + this._progressMinIntervalMs = normalizedIntervalMs; } /** * @param {boolean} nextStep */ _progress(nextStep = false) { + const now = Date.now(); + if (!nextStep && (now - this._lastProgressTimestamp) < this._progressMinIntervalMs) { + return; + } + this._lastProgressTimestamp = now; this._onProgress({...this._progressData, nextStep}); } + /** + * @param {string} message + */ + _logImport(message) { + if (!this._debugImportLogging) { + return; + } + // eslint-disable-next-line no-console + console.log(`[yomitan-import] ${message}`); + } + /** * @param {string} dictionaryTitle * @param {number} version @@ -489,39 +1406,6 @@ export class DictionaryImporter { return url.protocol === 'http:' || url.protocol === 'https:'; } - /** - * @param {import('ajv').ValidateFunction} schema - * @param {string} fileName - * @returns {ExtensionError} - */ - _formatAjvSchemaError(schema, fileName) { - const e = new ExtensionError(`Dictionary has invalid data in '${fileName}' '${JSON.stringify(schema.errors)}'`); - e.data = schema.errors; - return e; - } - - /** - * @param {number} version - * @returns {import('dictionary-importer').CompiledSchemaNameArray} - */ - _getDataBankSchemas(version) { - const termBank = ( - version === 1 ? - 'dictionaryTermBankV1' : - 'dictionaryTermBankV3' - ); - const termMetaBank = 'dictionaryTermMetaBankV3'; - const kanjiBank = ( - version === 1 ? 
- 'dictionaryKanjiBankV1' : - 'dictionaryKanjiBankV3' - ); - const kanjiMetaBank = 'dictionaryKanjiMetaBankV3'; - const tagBank = 'dictionaryTagBankV3'; - - return [termBank, termMetaBank, kanjiBank, kanjiMetaBank, tagBank]; - } - /** * @param {string} css * @returns {Error[]} @@ -557,6 +1441,12 @@ export class DictionaryImporter { * @returns {import('dictionary-data').TermGlossaryImage} */ _formatDictionaryTermGlossaryImage(data, entry, requirements) { + if (this._skipMediaImport) { + return { + ...data, + type: 'image', + }; + } /** @type {import('dictionary-data').TermGlossaryImage} */ const target = { type: 'image', @@ -573,6 +1463,9 @@ export class DictionaryImporter { * @returns {import('dictionary-data').TermGlossaryStructuredContent} */ _formatStructuredContent(data, entry, requirements) { + if (this._skipMediaImport) { + return data; + } const content = this._prepareStructuredContent(data.content, entry, requirements); return { type: 'structured-content', @@ -615,6 +1508,9 @@ export class DictionaryImporter { * @returns {import('structured-content').ImageElement} */ _prepareStructuredContentImage(content, entry, requirements) { + if (this._skipMediaImport) { + return {...content}; + } /** @type {import('structured-content').ImageElement} */ const target = { tag: 'img', @@ -635,9 +1531,9 @@ export class DictionaryImporter { /** @type {import('dictionary-importer').ImportRequirementContext} */ const context = {fileMap, media}; - for (const requirement of requirements) { + await this._runWithConcurrencyLimit(requirements, this._mediaResolutionConcurrency, async (requirement) => { await this._resolveAsyncRequirement(context, requirement); - } + }); return { media: [...media.values()], @@ -671,6 +1567,33 @@ export class DictionaryImporter { } } + /** + * @param {import('dictionary-importer').ImportRequirement} requirement + * @returns {boolean} + */ + _tryResolveRequirementFromCachedImageMetadata(requirement) { + const sourcePath = requirement.source.path; + 
const cachedMetadata = this._imageMetadataByPath.get(sourcePath); + if (typeof cachedMetadata === 'undefined') { + return false; + } + + switch (requirement.type) { + case 'image': + this._assignResolvedImageData(requirement.target, requirement.source, cachedMetadata.width, cachedMetadata.height); + return true; + case 'structured-content-image': + this._assignResolvedImageData(requirement.target, requirement.source, cachedMetadata.width, cachedMetadata.height); + if (typeof requirement.source.verticalAlign === 'string') { requirement.target.verticalAlign = requirement.source.verticalAlign; } + if (typeof requirement.source.border === 'string') { requirement.target.border = requirement.source.border; } + if (typeof requirement.source.borderRadius === 'string') { requirement.target.borderRadius = requirement.source.borderRadius; } + if (typeof requirement.source.sizeUnits === 'string') { requirement.target.sizeUnits = requirement.source.sizeUnits; } + return true; + default: + return false; + } + } + /** * @param {import('dictionary-importer').ImportRequirementContext} context * @param {import('dictionary-data').TermGlossaryImage} target @@ -708,6 +1631,20 @@ export class DictionaryImporter { * @param {import('dictionary-database').DatabaseTermEntry} entry */ async _createImageData(context, target, source, entry) { + const { + path, + } = source; + const {width, height} = await this._getImageMedia(context, path, entry); + this._assignResolvedImageData(target, source, width, height); + } + + /** + * @param {import('structured-content').ImageElementBase} target + * @param {import('structured-content').ImageElementBase} source + * @param {number} width + * @param {number} height + */ + _assignResolvedImageData(target, source, width, height) { const { path, width: preferredWidth, @@ -722,7 +1659,6 @@ export class DictionaryImporter { collapsed, collapsible, } = source; - const {width, height} = await this._getImageMedia(context, path, entry); target.path = path; 
target.width = width; target.height = height; @@ -760,49 +1696,101 @@ export class DictionaryImporter { }; // Check if already added - let mediaData = media.get(path); + const mediaData = media.get(path); if (typeof mediaData !== 'undefined') { if (getFileExtensionFromImageMediaType(mediaData.mediaType) === null) { throw createError('Media file is not a valid image'); } return mediaData; } - - // Find file in archive - const file = context.fileMap.get(path); - if (typeof file === 'undefined') { - throw createError('Could not find image'); + const pending = this._pendingImageMediaByPath.get(path); + if (typeof pending !== 'undefined') { + return await pending; + } + const cachedMetadata = this._imageMetadataByPath.get(path); + if (typeof cachedMetadata !== 'undefined') { + return { + dictionary, + path, + mediaType: cachedMetadata.mediaType, + width: cachedMetadata.width, + height: cachedMetadata.height, + content: new ArrayBuffer(0), + }; } + const promise = (async () => { + // Find file in archive + const file = context.fileMap.get(path); + if (typeof file === 'undefined') { + throw createError('Could not find image'); + } - // Load file content - let content = await (await this._getData(file, new BlobWriter())).arrayBuffer(); + // Load file content + let content = await (await this._getData(file, new BlobWriter())).arrayBuffer(); - const mediaType = getImageMediaTypeFromFileName(path); - if (mediaType === null) { - throw createError('Could not determine media type for image'); - } + const mediaType = getImageMediaTypeFromFileName(path); + if (mediaType === null) { + throw createError('Could not determine media type for image'); + } + + let width = 0; + let height = 0; + if (!this._skipImageMetadata) { + // Decode image only when metadata extraction is explicitly enabled. 
+ try { + ({content, width, height} = await this._mediaLoader.getImageDetails(content, mediaType)); + } catch (e) { + throw createError('Could not load image'); + } + } - // Load image data - let width; - let height; + const created = { + dictionary, + path, + mediaType, + width, + height, + content, + }; + this._imageMetadataByPath.set(path, {mediaType, width, height}); + media.set(path, created); + return created; + })(); + this._pendingImageMediaByPath.set(path, promise); try { - ({content, width, height} = await this._mediaLoader.getImageDetails(content, mediaType)); - } catch (e) { - throw createError('Could not load image'); + return await promise; + } finally { + this._pendingImageMediaByPath.delete(path); } + } - // Create image data - mediaData = { - dictionary, - path, - mediaType, - width, - height, - content, - }; - media.set(path, mediaData); - - return mediaData; + /** + * @template T + * @param {T[]} items + * @param {number} concurrency + * @param {(item: T) => Promise} fn + * @returns {Promise} + */ + async _runWithConcurrencyLimit(items, concurrency, fn) { + if (items.length === 0) { + return; + } + let nextIndex = 0; + const workerCount = Math.min(concurrency, items.length); + /** @type {Promise[]} */ + const workers = []; + for (let i = 0; i < workerCount; ++i) { + workers.push((async () => { + while (true) { + const index = nextIndex++; + if (index >= items.length) { + return; + } + await fn(items[index]); + } + })()); + } + await Promise.all(workers); } /** @@ -827,6 +1815,153 @@ export class DictionaryImporter { return {expression, reading, definitionTags, rules, score, glossary, sequence, termTags, dictionary}; } + /** + * @param {import('dictionary-database').DatabaseTermEntry[]} termList + * @param {boolean} enableTermEntryContentDedup + */ + _prepareTermImportSerialization(termList, enableTermEntryContentDedup) { + for (const entry of termList) { + this._prepareTermEntrySerialization(entry, enableTermEntryContentDedup); + } + } + + /** + 
* @param {import('dictionary-database').DatabaseTermEntry} entry + * @param {boolean} enableTermEntryContentDedup + * @param {Uint8Array|null} [glossaryJsonBytes] + */ + _prepareTermEntrySerialization(entry, enableTermEntryContentDedup, glossaryJsonBytes = null) { + if ( + enableTermEntryContentDedup && + hasPrecomputedTermEntryContent(entry) + ) { + return; + } + if ( + hasPrecomputedTermEntryContent(entry) && + typeof entry.glossaryJson === 'string' + ) { + return; + } + const glossaryJson = (typeof entry.glossaryJson === 'string') ? entry.glossaryJson : JSON.stringify(entry.glossary); + if (!enableTermEntryContentDedup) { + entry.glossaryJson = glossaryJson; + } + const definitionTags = entry.definitionTags ?? entry.tags ?? ''; + const termTags = entry.termTags ?? ''; + let hash1; + let hash2; + if ( + glossaryJsonBytes instanceof Uint8Array && + glossaryJsonBytes.byteLength > 0 + ) { + const contentBytes = encodeRawTermContentBinary(entry.rules, definitionTags, termTags, glossaryJsonBytes, this._textEncoder); + [hash1, hash2] = this._hashEntryContentBytesPair(contentBytes); + entry.termEntryContentBytes = contentBytes; + entry.termEntryContentRawGlossaryJsonBytes = void 0; + } else { + const contentBytes = this._textEncoder.encode(this._createTermEntryContentJson(entry.rules, definitionTags, termTags, glossaryJson)); + [hash1, hash2] = this._hashEntryContentBytesPair(contentBytes); + entry.termEntryContentBytes = contentBytes; + entry.termEntryContentRawGlossaryJsonBytes = void 0; + } + entry.termEntryContentHash1 = hash1; + entry.termEntryContentHash2 = hash2; + entry.termEntryContentHash = hashPairToHex(hash1, hash2); + } + + /** + * @param {string} rules + * @param {string} definitionTags + * @param {string} termTags + * @param {string} glossaryJson + * @returns {string} + */ + _createTermEntryContentJson(rules, definitionTags, termTags, glossaryJson) { + return 
`{"rules":${this._quoteJsonStringCached(rules)},"definitionTags":${this._quoteJsonStringCached(definitionTags)},"termTags":${this._quoteJsonStringCached(termTags)},"glossary":${glossaryJson}}`; + } + + /** + * @param {string} value + * @returns {string} + */ + _quoteJsonStringCached(value) { + const cached = this._jsonQuotedStringCache.get(value); + if (typeof cached !== 'undefined') { + // Promote to keep eviction order LRU-like. + this._jsonQuotedStringCache.delete(value); + this._jsonQuotedStringCache.set(value, cached); + return cached; + } + const quoted = JSON.stringify(value); + if (this._jsonQuotedStringCache.size >= JSON_QUOTED_STRING_CACHE_MAX_ENTRIES) { + const oldestKey = this._jsonQuotedStringCache.keys().next().value; + if (typeof oldestKey === 'string') { + this._jsonQuotedStringCache.delete(oldestKey); + } + } + this._jsonQuotedStringCache.set(value, quoted); + return quoted; + } + + /** + * @param {string} value + * @returns {Uint8Array} + */ + _getUtf8StringBytesCached(value) { + const cached = this._utf8StringBytesCache.get(value); + if (cached instanceof Uint8Array) { + this._utf8StringBytesCache.delete(value); + this._utf8StringBytesCache.set(value, cached); + return cached; + } + const bytes = this._textEncoder.encode(value); + if (this._utf8StringBytesCache.size >= JSON_QUOTED_STRING_CACHE_MAX_ENTRIES) { + const oldestKey = this._utf8StringBytesCache.keys().next().value; + if (typeof oldestKey === 'string') { + this._utf8StringBytesCache.delete(oldestKey); + } + } + this._utf8StringBytesCache.set(value, bytes); + return bytes; + } + + /** + * @param {string} contentJson + * @returns {string} + */ + _hashEntryContent(contentJson) { + const [h1, h2] = this._hashEntryContentPair(contentJson); + return hashPairToHex(h1, h2); + } + + /** + * @param {string} contentJson + * @returns {[number, number]} + */ + _hashEntryContentPair(contentJson) { + return this._hashEntryContentBytesPair(this._textEncoder.encode(contentJson)); + } + + /** + * @param 
{Uint8Array} bytes + * @returns {[number, number]} + */ + _hashEntryContentBytesPair(bytes) { + let h1 = 0x811c9dc5; + let h2 = 0x9e3779b9; + for (let i = 0, ii = bytes.length; i < ii; ++i) { + const code = bytes[i]; + h1 = Math.imul((h1 ^ code) >>> 0, 0x01000193); + h2 = Math.imul((h2 ^ code) >>> 0, 0x85ebca6b); + h2 = (h2 ^ (h2 >>> 13)) >>> 0; + } + if ((h1 | h2) === 0) { + h1 = 1; + } + return [h1 >>> 0, h2 >>> 0]; + } + /** * @param {import('dictionary-data').TermMeta} entry * @param {string} dictionary @@ -918,6 +2053,130 @@ export class DictionaryImporter { return results; } + /** + * @param {import('dictionary-importer').ArchiveFileMap} fileMap + * @returns {Promise<{termBanksByArtifact: Map, packedFileName: string|null, sharedGlossaryFileName: string|null, sharedGlossaryPackedOffset: number|null, sharedGlossaryPackedLength: number|null, sharedGlossaryCompression: string|null, sharedGlossaryUncompressedLength: number|null, termContentMode: string|null}|null>} + */ + async _readTermArtifactManifest(fileMap) { + const manifestEntry = fileMap.get(TERM_BANK_ARTIFACT_MANIFEST_FILE); + if (typeof manifestEntry === 'undefined') { + return null; + } + let manifest; + try { + manifest = /** @type {{termBanks?: Array<{artifact?: unknown, packedOffset?: unknown, packedLength?: unknown, rows?: unknown}>, packedTermArtifact?: {file?: unknown}|null, sharedGlossaryArtifact?: {file?: unknown, packedOffset?: unknown, packedLength?: unknown, compression?: unknown, uncompressedBytes?: unknown}|null, termContentMode?: unknown}|null} */ ( + parseJson(await this._getData(/** @type {import('@zip.js/zip.js').Entry} */ (manifestEntry), new TextWriter())) + ); + } catch (_) { + return null; + } + if (!(typeof manifest === 'object' && manifest !== null)) { + return null; + } + /** @type {Map} */ + const termBanksByArtifact = new Map(); + const termBanks = Array.isArray(manifest.termBanks) ? 
manifest.termBanks : []; + for (const termBank of termBanks) { + if (!(typeof termBank === 'object' && termBank !== null)) { continue; } + const artifact = typeof termBank.artifact === 'string' ? termBank.artifact : null; + const packedOffset = Number.isInteger(termBank.packedOffset) ? /** @type {number} */ (termBank.packedOffset) : -1; + const packedLength = Number.isInteger(termBank.packedLength) ? /** @type {number} */ (termBank.packedLength) : -1; + const rows = Number.isInteger(termBank.rows) ? /** @type {number} */ (termBank.rows) : null; + if (artifact === null || packedOffset < 0 || packedLength <= 0) { continue; } + termBanksByArtifact.set(artifact, {packedOffset, packedLength, rows}); + } + const packedFileName = ( + typeof manifest.packedTermArtifact === 'object' && + manifest.packedTermArtifact !== null && + typeof manifest.packedTermArtifact.file === 'string' + ) ? + manifest.packedTermArtifact.file : + null; + const sharedGlossaryFileName = ( + typeof manifest.sharedGlossaryArtifact === 'object' && + manifest.sharedGlossaryArtifact !== null && + typeof manifest.sharedGlossaryArtifact.file === 'string' + ) ? + manifest.sharedGlossaryArtifact.file : + null; + const sharedGlossaryPackedOffset = ( + typeof manifest.sharedGlossaryArtifact === 'object' && + manifest.sharedGlossaryArtifact !== null && + Number.isInteger(manifest.sharedGlossaryArtifact.packedOffset) + ) ? + /** @type {number} */ (manifest.sharedGlossaryArtifact.packedOffset) : + null; + const sharedGlossaryPackedLength = ( + typeof manifest.sharedGlossaryArtifact === 'object' && + manifest.sharedGlossaryArtifact !== null && + Number.isInteger(manifest.sharedGlossaryArtifact.packedLength) + ) ? + /** @type {number} */ (manifest.sharedGlossaryArtifact.packedLength) : + null; + const sharedGlossaryCompression = ( + typeof manifest.sharedGlossaryArtifact === 'object' && + manifest.sharedGlossaryArtifact !== null && + typeof manifest.sharedGlossaryArtifact.compression === 'string' + ) ? 
+ manifest.sharedGlossaryArtifact.compression :
+ null;
+ const sharedGlossaryUncompressedLength = (
+ typeof manifest.sharedGlossaryArtifact === 'object' &&
+ manifest.sharedGlossaryArtifact !== null &&
+ Number.isInteger(manifest.sharedGlossaryArtifact.uncompressedBytes)
+ ) ?
+ /** @type {number} */ (manifest.sharedGlossaryArtifact.uncompressedBytes) :
+ null;
+ const termContentModeValue = manifest.termContentMode;
+ const termContentMode = typeof termContentModeValue === 'string' ? termContentModeValue : null;
+ return {
+ termBanksByArtifact,
+ packedFileName,
+ sharedGlossaryFileName,
+ sharedGlossaryPackedOffset,
+ sharedGlossaryPackedLength,
+ sharedGlossaryCompression,
+ sharedGlossaryUncompressedLength,
+ termContentMode,
+ };
+ }
+
+ /**
+ * @param {import('@zip.js/zip.js').Entry[]} termArtifactFiles
+ * @returns {Promise<Map<string, Uint8Array>>}
+ */
+ async _preloadTermArtifactFiles(termArtifactFiles) {
+ /** @type {Map} */
+ const results = new Map();
+ for (let i = 0; i < termArtifactFiles.length; i += TERM_ARTIFACT_PRELOAD_CONCURRENCY) {
+ const batch = termArtifactFiles.slice(i, i + TERM_ARTIFACT_PRELOAD_CONCURRENCY);
+ const batchResults = await Promise.all(batch.map(async (termFile) => {
+ const bytes = await this._getData(termFile, new Uint8ArrayWriter());
+ return [termFile.filename, bytes];
+ }));
+ for (const [filename, bytes] of batchResults) {
+ results.set(/** @type {string} */ (filename), /** @type {Uint8Array} */ (bytes));
+ }
+ }
+ return results;
+ }
+
+ /**
+ * @param {Map} termBanksByArtifact
+ * @returns {{filename: string}[]}
+ */
+ _createPackedTermArtifactFiles(termBanksByArtifact) {
+ return [...termBanksByArtifact.keys()]
+ .sort((a, b) => {
+ const aMatch = /term_bank_(\d+)\.mbtb$/i.exec(a);
+ const bMatch = /term_bank_(\d+)\.mbtb$/i.exec(b);
+ const aIndex = aMatch !== null ? Number.parseInt(aMatch[1], 10) : Number.MAX_SAFE_INTEGER;
+ const bIndex = bMatch !== null ?
Number.parseInt(bMatch[1], 10) : Number.MAX_SAFE_INTEGER;
+ return aIndex - bIndex;
+ })
+ .map((filename) => ({filename}));
+ }
+
 /**
 * @template [TEntry=unknown]
 * @template [TResult=unknown]
@@ -951,32 +2210,698 @@ export class DictionaryImporter {
 }

 /**
- * @param {import('@zip.js/zip.js').Entry} file
- * @param {import('dictionary-importer').CompiledSchemaName} schemaName
- * @returns {Promise}
+ * @param {import('@zip.js/zip.js').Entry} termFile
+ * @param {import('dictionary-data').IndexVersion} version
+ * @param {string} dictionaryTitle
+ * @param {boolean} prefixWildcardsSupported
+ * @param {boolean} useMediaPipeline
+ * @param {boolean} enableTermEntryContentDedup
+ * @param {'baseline'|'raw-bytes'} termContentStorageMode
+ * @returns {Promise<{termList: import('dictionary-database').DatabaseTermEntry[], requirements: import('dictionary-importer').ImportRequirement[]|null}>}
 */
- async _validateFile(file, schemaName) {
- const content = await this._getData(file, new TextWriter());
- let entries;
-
+ async _readTermBankFile(
+ termFile,
+ version,
+ dictionaryTitle,
+ prefixWildcardsSupported,
+ useMediaPipeline,
+ enableTermEntryContentDedup,
+ termContentStorageMode,
+ ) {
+ if (!this._disableTermBankWasmFastPath) {
+ try {
+ return await this._readTermBankFileFast(
+ termFile,
+ version,
+ dictionaryTitle,
+ prefixWildcardsSupported,
+ useMediaPipeline,
+ enableTermEntryContentDedup,
+ termContentStorageMode,
+ );
+ } catch (e) {
+ this._logImport(`term file ${termFile.filename}: wasm parse fallback (${/** @type {Error} */ (toError(e)).message})`);
+ }
+ } else {
+ this._logImport(`term file ${termFile.filename}: wasm parser disabled by import flag`);
+ }
+ const content = await this._getData(termFile, new TextWriter());
+ let entries = /** @type {unknown} */ ([]);
 try {
- /** @type {unknown} */
 entries = parseJson(content);
 } catch (error) {
 if (error instanceof Error) {
- throw new Error(error.message + ` in '${file.filename}'`);
+ throw new
Error(error.message + ` in '${termFile.filename}'`); } } + if (!Array.isArray(entries)) { + return {termList: [], requirements: null}; + } + const parsedEntries = /** @type {unknown[]} */ (entries); + /** @type {import('dictionary-importer').ImportRequirement[]|null} */ + const requirements = useMediaPipeline ? [] : null; + + /** @type {import('dictionary-database').DatabaseTermEntry[]} */ + const result = []; + result.length = parsedEntries.length; + + for (let i = 0, ii = parsedEntries.length; i < ii; ++i) { + const raw = parsedEntries[i]; + const entry = version === 1 ? this._convertTermBankEntryV1(/** @type {import('dictionary-data').TermV1} */ (raw), dictionaryTitle) : this._convertTermBankEntryV3(/** @type {import('dictionary-data').TermV3} */ (raw), dictionaryTitle); + + this._assignPrefixReverseFields(entry, prefixWildcardsSupported); + + if (requirements !== null) { + const glossaryList = entry.glossary; + for (let j = 0, jj = glossaryList.length; j < jj; ++j) { + const glossary = glossaryList[j]; + if (typeof glossary !== 'object' || glossary === null || Array.isArray(glossary)) { continue; } + glossaryList[j] = this._formatDictionaryTermGlossaryObject(glossary, entry, requirements); + } + } - const schema = ajvSchemas[schemaName]; - if (!schema(entries)) { - throw this._formatAjvSchemaError(schema, file.filename); + if (requirements === null) { + this._prepareTermEntrySerialization(entry, enableTermEntryContentDedup); + } + result[i] = entry; } + return {termList: result, requirements}; + } - ++this._progressData.index; - this._progress(); + /** + * @param {import('@zip.js/zip.js').Entry} termFile + * @param {import('dictionary-data').IndexVersion} version + * @param {string} dictionaryTitle + * @param {boolean} prefixWildcardsSupported + * @param {boolean} useMediaPipeline + * @param {boolean} enableTermEntryContentDedup + * @param {'baseline'|'raw-bytes'} termContentStorageMode + * @param {(termList: import('dictionary-database').DatabaseTermEntry[], 
requirements: import('dictionary-importer').ImportRequirement[]|null, progress: {processedRows: number, totalRows: number, chunkIndex: number, chunkCount: number}) => Promise|void} [onChunk] + * @returns {Promise<{termList: import('dictionary-database').DatabaseTermEntry[], requirements: import('dictionary-importer').ImportRequirement[]|null}>} + */ + async _readTermBankFileFast(termFile, version, dictionaryTitle, prefixWildcardsSupported, useMediaPipeline, enableTermEntryContentDedup, termContentStorageMode, onChunk = void 0) { + this._lastFastTermBankReadProfile = null; + const bytes = await this._getData(termFile, new Uint8ArrayWriter()); + let wasmRowChunkSize = this._termBankWasmRowChunkSize; + if (this._adaptiveTermBankWasmRowChunkSizeTiered) { + if ( + bytes.byteLength >= ADAPTIVE_TERM_BANK_WASM_ROW_CHUNK_SIZE_THRESHOLD_BYTES && + bytes.byteLength < ADAPTIVE_TERM_BANK_WASM_ROW_CHUNK_SIZE_UPPER_BOUND_BYTES + ) { + wasmRowChunkSize = Math.max(wasmRowChunkSize, 4096); + } + } else if ( + this._adaptiveTermBankWasmRowChunkSize && + bytes.byteLength >= ADAPTIVE_TERM_BANK_WASM_ROW_CHUNK_SIZE_THRESHOLD_BYTES + ) { + wasmRowChunkSize = Math.max(wasmRowChunkSize, 4096); + } + let wasmInitialMetaCapacityDivisor = this._termBankWasmInitialMetaCapacityDivisor; + let wasmInitialContentBytesPerRow = this._termBankWasmInitialContentBytesPerRow; + if ( + this._adaptiveTermBankWasmInitialCapacity && + bytes.byteLength >= ADAPTIVE_TERM_BANK_WASM_ROW_CHUNK_SIZE_THRESHOLD_BYTES + ) { + wasmInitialMetaCapacityDivisor = Math.min( + wasmInitialMetaCapacityDivisor, + ADAPTIVE_TERM_BANK_WASM_INITIAL_META_CAPACITY_DIVISOR, + ); + wasmInitialContentBytesPerRow = Math.max( + wasmInitialContentBytesPerRow, + ADAPTIVE_TERM_BANK_WASM_INITIAL_CONTENT_BYTES_PER_ROW, + ); + } + const streamToChunkHandler = typeof onChunk === 'function'; + if (streamToChunkHandler && !useMediaPipeline) { + wasmRowChunkSize = Math.max(wasmRowChunkSize, NO_MEDIA_FAST_PATH_TERM_BANK_WASM_ROW_CHUNK_SIZE); + } + 
/** @type {import('dictionary-importer').ImportRequirement[]|null} */ + const requirements = (useMediaPipeline && !streamToChunkHandler) ? [] : null; + /** @type {import('dictionary-database').DatabaseTermEntry[]} */ + const termList = []; + const minimalDecode = this._wasmCanonicalRowsFastPath && !useMediaPipeline; + const usePrecomputedContentForMediaRows = useMediaPipeline && this._wasmPassThroughTermContent && this._usePrecomputedContentForMediaRows; + const useRawBytesDirectContent = (termContentStorageMode === 'raw-bytes' && !useMediaPipeline); + const includeContentMetadata = useRawBytesDirectContent ? false : (this._wasmPassThroughTermContent || !this._wasmSkipUnusedTermContentEncoding); + const useLazyGlossaryDecode = useRawBytesDirectContent || (useMediaPipeline && (this._lazyGlossaryDecodeForMedia || this._glossaryMediaFastScan || usePrecomputedContentForMediaRows)); + const useMediaHintFastScan = useMediaPipeline && ( + this._wasmPassThroughTermContent || + this._lazyGlossaryDecodeForMedia || + this._glossaryMediaFastScan || + usePrecomputedContentForMediaRows + ); + try { + let importerMaterializationMs = 0; + let importerChunkSinkMs = 0; + let importerChunkCount = 0; + let importerTotalRows = 0; + await parseTermBankWithWasmChunks( + bytes, + version, + async (parsedRows, chunkProgress) => { + ++importerChunkCount; + importerTotalRows = chunkProgress.processedRows; + /** @type {import('dictionary-importer').ImportRequirement[]|null} */ + const requirementsForChunk = useMediaPipeline ? 
[] : null; + if (requirementsForChunk !== null) { + requirementsForChunk.length = 0; + } + /** @type {import('dictionary-database').DatabaseTermEntry[]} */ + const termListChunk = []; + termListChunk.length = parsedRows.length; + const tMaterializationStart = Date.now(); + for (let i = 0, ii = parsedRows.length; i < ii; ++i) { + const row = /** @type {ParsedTermBankChunkRow} */ (parsedRows[i]); + const expression = row.expression; + const reading = row.reading.length > 0 ? row.reading : expression; + const hasPrecomputedTermContent = hasPrecomputedTermEntryContent(row); + let usePrecomputedTermContent = false; + const useLeanTermEntryObject = ( + this._leanCanonicalTermEntryObjects && + requirementsForChunk === null && + hasPrecomputedTermContent + ); + /** @type {import('dictionary-database').DatabaseTermEntry} */ + const entry = useLeanTermEntryObject ? + { + expression, + reading, + definitionTags: '', + rules: '', + score: row.score, + glossary: [], + termTags: '', + dictionary: dictionaryTitle, + } : + { + expression, + reading, + definitionTags: row.definitionTags ?? '', + rules: row.rules ?? '', + score: row.score, + glossary: [], + termTags: row.termTags ?? '', + dictionary: dictionaryTitle, + }; + if (requirementsForChunk === null) { + if (typeof row.glossaryJson === 'string' && row.glossaryJson.length > 0) { + entry.glossaryJson = row.glossaryJson; + } + usePrecomputedTermContent = !useRawBytesDirectContent; + } else { + const skipGlossaryParse = ( + typeof row.glossaryMayContainMedia === 'boolean' ? 
+ !row.glossaryMayContainMedia : + !this._glossaryJsonLikelyContainsMedia(this._getFastRowGlossaryJson(row)) + ); + if (skipGlossaryParse) { + if (!this._wasmPassThroughTermContent) { + entry.glossaryJson = this._getFastRowGlossaryJson(row); + } + usePrecomputedTermContent = true; + } else { + let glossaryList; + if (usePrecomputedContentForMediaRows && hasPrecomputedTermContent) { + const contentPayload = this._parseTermEntryContentFromFastRow(row, termFile.filename); + entry.rules = contentPayload.rules; + entry.definitionTags = contentPayload.definitionTags; + entry.termTags = contentPayload.termTags; + glossaryList = contentPayload.glossary; + } else { + const rowGlossaryJson = this._getFastRowGlossaryJson(row); + glossaryList = this._parseGlossaryJsonFromFastRow(rowGlossaryJson, termFile.filename); + } + for (let j = 0, jj = glossaryList.length; j < jj; ++j) { + const glossary = glossaryList[j]; + if (typeof glossary !== 'object' || glossary === null || Array.isArray(glossary)) { continue; } + glossaryList[j] = this._formatDictionaryTermGlossaryObject(glossary, entry, requirementsForChunk); + } + entry.glossary = glossaryList; + } + } + if (typeof row.sequence === 'number') { + entry.sequence = row.sequence; + } + this._assignPrefixReverseFields(entry, prefixWildcardsSupported); + if ( + usePrecomputedTermContent && + this._wasmPassThroughTermContent && + hasPrecomputedTermContent + ) { + if (typeof row.termEntryContentHash === 'string' && row.termEntryContentHash.length > 0) { + entry.termEntryContentHash = row.termEntryContentHash; + } + if (Number.isInteger(row.termEntryContentHash1) && Number.isInteger(row.termEntryContentHash2)) { + entry.termEntryContentHash1 = /** @type {number} */ (row.termEntryContentHash1); + entry.termEntryContentHash2 = /** @type {number} */ (row.termEntryContentHash2); + } + entry.termEntryContentBytes = row.termEntryContentBytes; + } + // Keep serialization canonical with the runtime deserializer. 
+ if ( + requirementsForChunk === null || + ( + requirementsForChunk !== null && + ( + !hasPrecomputedTermEntryContent(entry) + ) + ) + ) { + if ( + requirementsForChunk !== null && + typeof entry.glossaryJson !== 'string' && + ( + !hasPrecomputedTermEntryContent(entry) + ) + ) { + entry.glossaryJson = this._getFastRowGlossaryJson(row); + } + this._prepareTermEntrySerialization( + entry, + enableTermEntryContentDedup, + ( + useRawBytesDirectContent && + requirementsForChunk === null && + row.glossaryJsonBytes instanceof Uint8Array + ) ? + row.glossaryJsonBytes : + null, + ); + } + termListChunk[i] = entry; + } + importerMaterializationMs += Math.max(0, Date.now() - tMaterializationStart); + + const tChunkSinkStart = Date.now(); + if (streamToChunkHandler) { + await /** @type {(termList: import('dictionary-database').DatabaseTermEntry[], requirements: import('dictionary-importer').ImportRequirement[]|null, progress: {processedRows: number, totalRows: number, chunkIndex: number, chunkCount: number}) => Promise|void} */ (onChunk)( + termListChunk, + requirementsForChunk, + chunkProgress, + ); + } else { + termList.push(...termListChunk); + if (requirements !== null && requirementsForChunk !== null) { + requirements.push(...requirementsForChunk); + } + } + importerChunkSinkMs += Math.max(0, Date.now() - tChunkSinkStart); + }, + wasmRowChunkSize, + { + copyContentBytes: this._wasmPassThroughTermContent && !streamToChunkHandler, + includeContentMetadata, + initialMetaCapacityDivisor: wasmInitialMetaCapacityDivisor, + initialContentBytesPerRow: wasmInitialContentBytesPerRow, + minimalDecode, + reuseExpressionForReadingDecode: this._wasmReuseExpressionForReadingDecode, + preallocateChunkRows: this._wasmPreallocateChunkRows, + skipTagRuleDecode: usePrecomputedContentForMediaRows, + lazyGlossaryDecode: useLazyGlossaryDecode, + mediaHintFastScan: useMediaHintFastScan, + }, + ); + const parserProfile = consumeLastTermBankWasmParseProfile(); + this._lastFastTermBankReadProfile 
= { + parserProfile, + materializationMs: importerMaterializationMs, + chunkSinkMs: importerChunkSinkMs, + chunkCount: importerChunkCount, + totalRows: importerTotalRows, + }; + } catch (error) { + consumeLastTermBankWasmParseProfile(); + throw toError(error); + } + return {termList, requirements}; + } + + /** + * @param {import('@zip.js/zip.js').Entry} termFile + * @param {string} dictionaryTitle + * @param {boolean} prefixWildcardsSupported + * @param {'baseline'|'raw-bytes'} termContentStorageMode + * @param {(termList: import('dictionary-database').DatabaseTermEntry[], requirements: import('dictionary-importer').ImportRequirement[]|null, progress: {processedRows: number, totalRows: number, chunkIndex: number, chunkCount: number}) => Promise|void} [onChunk] + * @param {number} [sharedGlossaryBaseOffset] + * @returns {Promise<{termList: import('dictionary-database').DatabaseTermEntry[], requirements: import('dictionary-importer').ImportRequirement[]|null}>} + */ + async _readTermBankArtifactFile(termFile, dictionaryTitle, prefixWildcardsSupported, termContentStorageMode, onChunk = void 0, sharedGlossaryBaseOffset = 0) { + this._lastArtifactTermBankReadProfile = null; + const tReadBytesStart = Date.now(); + const bytes = await this._getData(termFile, new Uint8ArrayWriter()); + const readBytesMs = Math.max(0, Date.now() - tReadBytesStart); + return await this._decodeTermBankArtifactBytes(bytes, termFile.filename, dictionaryTitle, prefixWildcardsSupported, termContentStorageMode, onChunk, readBytesMs, sharedGlossaryBaseOffset); + } + + /** + * @param {Uint8Array} bytes + * @param {string} filename + * @param {string} dictionaryTitle + * @param {boolean} prefixWildcardsSupported + * @param {'baseline'|'raw-bytes'} termContentStorageMode + * @param {(termList: import('dictionary-database').DatabaseTermEntry[], requirements: import('dictionary-importer').ImportRequirement[]|null, progress: {processedRows: number, totalRows: number, chunkIndex: number, chunkCount: 
number}) => Promise|void} [onChunk]
+ * @param {number} readBytesMs
+ * @param {number} [sharedGlossaryBaseOffset]
+ * @returns {Promise<{termList: import('dictionary-database').DatabaseTermEntry[], requirements: import('dictionary-importer').ImportRequirement[]|null}>}
+ */
+ async _decodeTermBankArtifactBytes(bytes, filename, dictionaryTitle, prefixWildcardsSupported, termContentStorageMode, onChunk = void 0, readBytesMs = 0, sharedGlossaryBaseOffset = 0) {
+ const textDecoder = this._textDecoder;
+ if (bytes.byteLength < (TERM_BANK_ARTIFACT_MAGIC_BYTES + 4)) {
+ throw new Error(`Invalid term artifact payload in '${filename}': too small`);
+ }
+ const magic = textDecoder.decode(bytes.subarray(0, TERM_BANK_ARTIFACT_MAGIC_BYTES));
+ if (magic !== TERM_BANK_ARTIFACT_MAGIC) {
+ throw new Error(`Invalid term artifact payload in '${filename}': bad magic`);
+ }
+ const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
+ let cursor = TERM_BANK_ARTIFACT_MAGIC_BYTES;
+ const rowCount = view.getUint32(cursor, true);
+ cursor += 4;
+ const streamToChunkHandler = typeof onChunk === 'function';
+ /** @type {import('dictionary-database').DatabaseTermEntry[]} */
+ const termList = streamToChunkHandler ?
[] : new Array(rowCount);
+ const chunkSize = this._termArtifactRowChunkSize;
+ const chunkCount = Math.max(1, Math.ceil(rowCount / Math.max(1, chunkSize)));
+ let chunkIndex = 0;
+ const tDecodeRowsStart = Date.now();
+ let decodeRowsMs = 0;
+ let reverseRowsMs = 0;
+ let importerChunkSinkMs = 0;
+ let metadataRebaseMs = 0;
+ for (let i = 0; i < rowCount; ++i) {
+ if ((cursor + 4) > bytes.byteLength) {
+ throw new Error(`Invalid term artifact payload in '${filename}': truncated expression length`);
+ }
+ const expressionLength = view.getUint32(cursor, true);
+ cursor += 4;
+ if ((cursor + expressionLength + 4) > bytes.byteLength) {
+ throw new Error(`Invalid term artifact payload in '${filename}': truncated expression`);
+ }
+ const expressionStart = cursor;
+ const expression = textDecoder.decode(bytes.subarray(cursor, cursor + expressionLength));
+ cursor += expressionLength;
+ const readingLength = view.getUint32(cursor, true);
+ cursor += 4;
+ if ((cursor + readingLength + 20) > bytes.byteLength) {
+ throw new Error(`Invalid term artifact payload in '${filename}': truncated row payload`);
+ }
+ const readingStart = cursor;
+ const readingRaw = (
+ readingLength > 0 &&
+ readingLength === expressionLength &&
+ byteRangeEqual(bytes, expressionStart, readingStart, expressionLength)
+ ) ?
+ expression :
+ textDecoder.decode(bytes.subarray(cursor, cursor + readingLength));
+ cursor += readingLength;
+ const score = view.getInt32(cursor, true);
+ cursor += 4;
+ const sequenceRaw = view.getInt32(cursor, true);
+ cursor += 4;
+ const hash1 = view.getUint32(cursor, true);
+ cursor += 4;
+ const hash2 = view.getUint32(cursor, true);
+ cursor += 4;
+ const contentLength = view.getUint32(cursor, true);
+ cursor += 4;
+ if ((cursor + contentLength) > bytes.byteLength) {
+ throw new Error(`Invalid term artifact payload in '${filename}': truncated content bytes`);
+ }
+ const contentStart = cursor;
+ const contentEnd = contentStart + contentLength;
+ const reading = readingRaw.length > 0 ? readingRaw : expression;
+ const sequence = sequenceRaw >= 0 ? sequenceRaw : void 0;
+ let expressionReverse;
+ let readingReverse;
+ if (prefixWildcardsSupported) {
+ const tReverseStart = Date.now();
+ const reversedExpression = this._reverseString(expression);
+ expressionReverse = reversedExpression;
+ readingReverse = (this._reuseExpressionReverseForReading && reading === expression) ?
+ reversedExpression : + this._reverseString(reading); + reverseRowsMs += Math.max(0, Date.now() - tReverseStart); + } + let contentBytes = bytes.subarray(contentStart, contentEnd); + if (sharedGlossaryBaseOffset > 0 && isRawTermContentSharedGlossaryBinary(contentBytes)) { + const tMetadataRebaseStart = Date.now(); + contentBytes = rebaseRawTermContentSharedGlossaryBinary(contentBytes, sharedGlossaryBaseOffset); + metadataRebaseMs += Math.max(0, Date.now() - tMetadataRebaseStart); + } + /** @type {import('dictionary-database').DatabaseTermEntry} */ + const entry = { + expression, + reading, + expressionReverse, + readingReverse, + definitionTags: null, + rules: '', + score, + glossary: EMPTY_TERM_GLOSSARY, + dictionary: dictionaryTitle, + termEntryContentHash1: hash1, + termEntryContentHash2: hash2, + termEntryContentBytes: contentBytes, + sequence, + }; + if (termContentStorageMode === 'raw-bytes' && isRawTermContentSharedGlossaryBinary(contentBytes)) { + entry.termEntryContentDictName = sharedGlossaryBaseOffset > 0 ? 
+ RAW_TERM_CONTENT_SHARED_GLOSSARY_DICT_NAME : + RAW_TERM_CONTENT_COMPRESSED_SHARED_GLOSSARY_DICT_NAME; + } + this._normalizeArtifactTermEntryContent(entry, termContentStorageMode); + cursor = contentEnd; + if (streamToChunkHandler) { + termList.push(entry); + if (termList.length >= chunkSize) { + ++chunkIndex; + const tChunkSinkStart = Date.now(); + await /** @type {(termList: import('dictionary-database').DatabaseTermEntry[], requirements: import('dictionary-importer').ImportRequirement[]|null, progress: {processedRows: number, totalRows: number, chunkIndex: number, chunkCount: number}) => Promise|void} */ (onChunk)(termList, null, { + processedRows: i + 1, + totalRows: rowCount, + chunkIndex, + chunkCount, + }); + importerChunkSinkMs += Math.max(0, Date.now() - tChunkSinkStart); + termList.length = 0; + } + } else { + termList[i] = entry; + } + } + decodeRowsMs = Math.max(0, Date.now() - tDecodeRowsStart - reverseRowsMs); + if (streamToChunkHandler && termList.length > 0) { + ++chunkIndex; + const tChunkSinkStart = Date.now(); + await /** @type {(termList: import('dictionary-database').DatabaseTermEntry[], requirements: import('dictionary-importer').ImportRequirement[]|null, progress: {processedRows: number, totalRows: number, chunkIndex: number, chunkCount: number}) => Promise|void} */ (onChunk)(termList, null, { + processedRows: rowCount, + totalRows: rowCount, + chunkIndex, + chunkCount, + }); + importerChunkSinkMs += Math.max(0, Date.now() - tChunkSinkStart); + termList.length = 0; + } + this._lastArtifactTermBankReadProfile = { + readBytesMs, + decodeRowsMs, + reverseRowsMs, + metadataRebaseMs, + chunkSinkMs: importerChunkSinkMs, + chunkCount: streamToChunkHandler ? chunkIndex : 0, + totalRows: rowCount, + rowChunkSize: chunkSize, + }; + return {termList: streamToChunkHandler ? 
[] : termList, requirements: null}; + } + + /** + * @param {import('dictionary-database').DatabaseTermEntry} entry + * @param {'baseline'|'raw-bytes'} termContentStorageMode + * @returns {void} + */ + _normalizeArtifactTermEntryContent(entry, termContentStorageMode) { + if (termContentStorageMode !== 'raw-bytes') { + return; + } + const termEntryContentBytes = entry.termEntryContentBytes; + if (!(termEntryContentBytes instanceof Uint8Array) || termEntryContentBytes.byteLength === 0) { + return; + } + if ( + decodeRawTermContentBinary(termEntryContentBytes, this._textDecoder) !== null || + isRawTermContentSharedGlossaryBinary(termEntryContentBytes) + ) { + return; + } + const parsedContent = decodeRawTermContentBinary(termEntryContentBytes, this._textDecoder) ?? (() => { + try { + const value = /** @type {{rules?: unknown, definitionTags?: unknown, termTags?: unknown, glossary?: unknown}} */ ( + parseJson(this._textDecoder.decode(termEntryContentBytes)) + ); + return { + rules: typeof value.rules === 'string' ? value.rules : '', + definitionTags: typeof value.definitionTags === 'string' ? value.definitionTags : '', + termTags: typeof value.termTags === 'string' ? value.termTags : '', + glossaryJson: JSON.stringify(Array.isArray(value.glossary) ? 
value.glossary : []), + }; + } catch (_) { + return null; + } + })(); + if (parsedContent === null) { + return; + } + const glossaryJsonBytes = this._textEncoder.encode(parsedContent.glossaryJson); + entry.rules = parsedContent.rules; + entry.definitionTags = parsedContent.definitionTags; + entry.termTags = parsedContent.termTags; + const rawBytes = encodeRawTermContentBinary( + parsedContent.rules, + parsedContent.definitionTags, + parsedContent.termTags, + glossaryJsonBytes, + this._textEncoder, + ); + const [hash1, hash2] = this._hashEntryContentBytesPair(rawBytes); + entry.termEntryContentHash1 = hash1; + entry.termEntryContentHash2 = hash2; + entry.termEntryContentHash = hashPairToHex(hash1, hash2); + entry.termEntryContentBytes = rawBytes; + entry.termEntryContentRawGlossaryJsonBytes = void 0; + } + + /** + * @param {string} glossaryJson + * @returns {boolean} + */ + _glossaryJsonLikelyContainsMedia(glossaryJson) { + if (this._glossaryMediaFastScan) { + return this._glossaryJsonLikelyContainsMediaFast(glossaryJson); + } + return /"type"\s*:\s*"image"|"tag"\s*:\s*"img"/.test(glossaryJson); + } - return true; + /** + * @param {string} glossaryJson + * @returns {boolean} + */ + _glossaryJsonLikelyContainsMediaFast(glossaryJson) { + const hasTypeImage = glossaryJson.includes('"type"') && glossaryJson.includes('"image"'); + const hasTagImg = glossaryJson.includes('"tag"') && glossaryJson.includes('"img"'); + return hasTypeImage || hasTagImg; + } + + /** + * @param {{glossaryJson?: string, glossaryJsonBytes?: Uint8Array}} row + * @returns {string} + */ + _getFastRowGlossaryJson(row) { + if (typeof row.glossaryJson === 'string') { + return row.glossaryJson; + } + if (row.glossaryJsonBytes instanceof Uint8Array) { + const glossaryJson = this._textDecoder.decode(row.glossaryJsonBytes); + row.glossaryJson = glossaryJson; + return glossaryJson; + } + return '[]'; + } + + /** + * @param {import('dictionary-database').DatabaseTermEntry} entry + * @param {boolean} 
prefixWildcardsSupported + */ + _assignPrefixReverseFields(entry, prefixWildcardsSupported) { + if (!prefixWildcardsSupported) { + return; + } + const expressionReverse = this._reverseString(entry.expression); + entry.expressionReverse = expressionReverse; + if (this._reuseExpressionReverseForReading && entry.reading === entry.expression) { + entry.readingReverse = expressionReverse; + return; + } + entry.readingReverse = this._reverseString(entry.reading); + } + + /** + * @param {string} value + * @returns {string} + */ + _reverseString(value) { + if (!this._cacheReverseStrings) { + return this._fastPrefixReverse ? + reverseUtf16PreserveSurrogates(value) : + stringReverse(value); + } + const cached = this._reverseStringCache.get(value); + if (typeof cached === 'string') { + return cached; + } + const reversed = this._fastPrefixReverse ? + reverseUtf16PreserveSurrogates(value) : + stringReverse(value); + if (this._reverseStringCache.size >= this._reverseStringCacheMaxEntries) { + this._reverseStringCache.clear(); + } + this._reverseStringCache.set(value, reversed); + return reversed; + } + + /** + * @param {string} glossaryJson + * @param {string} fileName + * @returns {import('dictionary-data').TermGlossary[]} + * @throws {Error} + */ + _parseGlossaryJsonFromFastRow(glossaryJson, fileName) { + try { + const glossary = /** @type {unknown} */ (parseJson(glossaryJson)); + return Array.isArray(glossary) ? 
glossary : []; + } catch (error) { + if (error instanceof Error) { + throw new Error(error.message + ` in '${fileName}'`); + } + throw error; + } + } + + /** + * @param {{termEntryContentBytes?: Uint8Array}} row + * @param {string} fileName + * @returns {{rules: string, definitionTags: string, termTags: string, glossary: import('dictionary-data').TermGlossary[]}} + * @throws {Error} + */ + _parseTermEntryContentFromFastRow(row, fileName) { + if (!(row.termEntryContentBytes instanceof Uint8Array) || row.termEntryContentBytes.byteLength === 0) { + throw new Error(`Invalid precomputed term content in '${fileName}': missing content bytes`); + } + const termEntryContentBytes = row.termEntryContentBytes; + try { + const rawContent = decodeRawTermContentBinary(termEntryContentBytes, this._textDecoder); + if (rawContent !== null) { + return { + rules: rawContent.rules, + definitionTags: rawContent.definitionTags, + termTags: rawContent.termTags, + glossary: this._parseGlossaryJsonFromFastRow(rawContent.glossaryJson, fileName), + }; + } + const value = /** @type {{rules?: unknown, definitionTags?: unknown, termTags?: unknown, glossary?: unknown}} */ ( + parseJson(this._textDecoder.decode(termEntryContentBytes)) + ); + const rules = typeof value.rules === 'string' ? value.rules : ''; + const definitionTags = typeof value.definitionTags === 'string' ? value.definitionTags : ''; + const termTags = typeof value.termTags === 'string' ? value.termTags : ''; + const glossary = Array.isArray(value.glossary) ? 
/** @type {import('dictionary-data').TermGlossary[]} */ (value.glossary) : []; + return {rules, definitionTags, termTags, glossary}; + } catch (error) { + if (error instanceof Error) { + throw new Error(error.message + ` in '${fileName}'`); + } + throw error; + } } /** diff --git a/ext/js/dictionary/dictionary-worker-handler.js b/ext/js/dictionary/dictionary-worker-handler.js index 6c27d0b478..e432297390 100644 --- a/ext/js/dictionary/dictionary-worker-handler.js +++ b/ext/js/dictionary/dictionary-worker-handler.js @@ -88,6 +88,17 @@ export class DictionaryWorkerHandler { async _importDictionary({details, archiveContent}, onProgress) { const dictionaryDatabase = await this._getPreparedDictionaryDatabase(); try { + if ( + typeof dictionaryDatabase.usesFallbackStorage === 'function' && + dictionaryDatabase.usesFallbackStorage() + ) { + const diagnostics = ( + typeof dictionaryDatabase.getOpenStorageDiagnostics === 'function' ? + dictionaryDatabase.getOpenStorageDiagnostics() : + null + ); + throw new Error(`OPFS is required for dictionary import. diagnostics=${JSON.stringify(diagnostics)}`); + } const dictionaryImporter = new DictionaryImporter(this._mediaLoader, onProgress); const {result, errors} = await dictionaryImporter.importDictionary(dictionaryDatabase, archiveContent, details); return { diff --git a/ext/js/dictionary/raw-term-content.js b/ext/js/dictionary/raw-term-content.js new file mode 100644 index 0000000000..994f5169de --- /dev/null +++ b/ext/js/dictionary/raw-term-content.js @@ -0,0 +1,227 @@ +/* + * Copyright (C) 2023-2025 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
/*
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

// Inline layout ('MBR1'):
//   [4B magic][u32 rulesLen][u32 defTagsLen][u32 termTagsLen][u32 glossaryLen]
//   [rules][definitionTags][termTags][glossaryJson]
// All strings UTF-8; all integers little-endian.
const RAW_TERM_CONTENT_MAGIC = new Uint8Array([0x4d, 0x42, 0x52, 0x31]);
const RAW_TERM_CONTENT_HEADER_BYTES = 20;
// Shared-glossary layout ('MBR2'):
//   [4B magic][u32 rulesLen][u32 defTagsLen][u32 termTagsLen][u64 glossaryOffset][u32 glossaryLen]
//   [rules][definitionTags][termTags]
// The glossary JSON itself lives out-of-line at `glossaryOffset` in a shared blob.
const RAW_TERM_CONTENT_SHARED_GLOSSARY_MAGIC = new Uint8Array([0x4d, 0x42, 0x52, 0x32]);
const RAW_TERM_CONTENT_SHARED_GLOSSARY_HEADER_BYTES = 28;

export const RAW_TERM_CONTENT_DICT_NAME = 'raw-v2';

export const RAW_TERM_CONTENT_SHARED_GLOSSARY_DICT_NAME = 'raw-v3';

export const RAW_TERM_CONTENT_COMPRESSED_SHARED_GLOSSARY_DICT_NAME = 'raw-v4';

/**
 * Decodes the header and string sections of an inline ('MBR1') record.
 * Returns `null` for non-'MBR1' input or when the declared section lengths
 * do not account for every byte of the record.
 * @param {Uint8Array} bytes
 * @param {TextDecoder} textDecoder
 * @returns {{rules: string, definitionTags: string, termTags: string, glossaryJsonOffset: number, glossaryJsonLength: number}|null}
 */
export function decodeRawTermContentHeader(bytes, textDecoder) {
    if (!isRawTermContentBinary(bytes)) {
        return null;
    }
    const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    const rulesLength = view.getUint32(4, true);
    const definitionTagsLength = view.getUint32(8, true);
    const termTagsLength = view.getUint32(12, true);
    const glossaryJsonLength = view.getUint32(16, true);
    const totalLength = RAW_TERM_CONTENT_HEADER_BYTES + rulesLength + definitionTagsLength + termTagsLength + glossaryJsonLength;
    if (totalLength !== bytes.byteLength) {
        return null;
    }
    let offset = RAW_TERM_CONTENT_HEADER_BYTES;
    const rules = textDecoder.decode(bytes.subarray(offset, offset + rulesLength));
    offset += rulesLength;
    const definitionTags = textDecoder.decode(bytes.subarray(offset, offset + definitionTagsLength));
    offset += definitionTagsLength;
    const termTags = textDecoder.decode(bytes.subarray(offset, offset + termTagsLength));
    offset += termTagsLength;
    return {rules, definitionTags, termTags, glossaryJsonOffset: offset, glossaryJsonLength};
}

/**
 * Returns a view (not a copy) of the glossary JSON bytes within a record.
 * @param {Uint8Array} bytes
 * @param {number} offset
 * @param {number} length
 * @returns {Uint8Array}
 */
export function getRawTermContentGlossaryJsonBytes(bytes, offset, length) {
    return bytes.subarray(offset, offset + length);
}

/**
 * Checks the 'MBR1' magic and minimum header size; does not validate section lengths.
 * @param {Uint8Array} bytes
 * @returns {boolean}
 */
export function isRawTermContentBinary(bytes) {
    return (
        bytes.byteLength >= RAW_TERM_CONTENT_HEADER_BYTES &&
        bytes[0] === RAW_TERM_CONTENT_MAGIC[0] &&
        bytes[1] === RAW_TERM_CONTENT_MAGIC[1] &&
        bytes[2] === RAW_TERM_CONTENT_MAGIC[2] &&
        bytes[3] === RAW_TERM_CONTENT_MAGIC[3]
    );
}

/**
 * Checks the 'MBR2' magic and minimum header size; does not validate section lengths.
 * @param {Uint8Array} bytes
 * @returns {boolean}
 */
export function isRawTermContentSharedGlossaryBinary(bytes) {
    return (
        bytes.byteLength >= RAW_TERM_CONTENT_SHARED_GLOSSARY_HEADER_BYTES &&
        bytes[0] === RAW_TERM_CONTENT_SHARED_GLOSSARY_MAGIC[0] &&
        bytes[1] === RAW_TERM_CONTENT_SHARED_GLOSSARY_MAGIC[1] &&
        bytes[2] === RAW_TERM_CONTENT_SHARED_GLOSSARY_MAGIC[2] &&
        bytes[3] === RAW_TERM_CONTENT_SHARED_GLOSSARY_MAGIC[3]
    );
}

/**
 * Encodes an inline ('MBR1') record. `glossaryJsonBytes` is accepted
 * pre-encoded so callers can reuse one encoding across hashing and storage.
 * @param {string} rules
 * @param {string} definitionTags
 * @param {string} termTags
 * @param {Uint8Array} glossaryJsonBytes
 * @param {TextEncoder} textEncoder
 * @returns {Uint8Array}
 */
export function encodeRawTermContentBinary(rules, definitionTags, termTags, glossaryJsonBytes, textEncoder) {
    const rulesBytes = textEncoder.encode(rules);
    const definitionTagsBytes = textEncoder.encode(definitionTags);
    const termTagsBytes = textEncoder.encode(termTags);
    const totalBytes = (
        RAW_TERM_CONTENT_HEADER_BYTES +
        rulesBytes.byteLength +
        definitionTagsBytes.byteLength +
        termTagsBytes.byteLength +
        glossaryJsonBytes.byteLength
    );
    const bytes = new Uint8Array(totalBytes);
    bytes.set(RAW_TERM_CONTENT_MAGIC, 0);
    const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    view.setUint32(4, rulesBytes.byteLength, true);
    view.setUint32(8, definitionTagsBytes.byteLength, true);
    view.setUint32(12, termTagsBytes.byteLength, true);
    view.setUint32(16, glossaryJsonBytes.byteLength, true);
    let offset = RAW_TERM_CONTENT_HEADER_BYTES;
    bytes.set(rulesBytes, offset);
    offset += rulesBytes.byteLength;
    bytes.set(definitionTagsBytes, offset);
    offset += definitionTagsBytes.byteLength;
    bytes.set(termTagsBytes, offset);
    offset += termTagsBytes.byteLength;
    bytes.set(glossaryJsonBytes, offset);
    return bytes;
}

/**
 * Encodes a shared-glossary ('MBR2') record referencing glossary JSON stored
 * out-of-line at `glossaryOffset`/`glossaryLength` in a shared blob.
 * @param {string} rules
 * @param {string} definitionTags
 * @param {string} termTags
 * @param {number} glossaryOffset
 * @param {number} glossaryLength
 * @param {TextEncoder} textEncoder
 * @returns {Uint8Array}
 */
export function encodeRawTermContentSharedGlossaryBinary(rules, definitionTags, termTags, glossaryOffset, glossaryLength, textEncoder) {
    const rulesBytes = textEncoder.encode(rules);
    const definitionTagsBytes = textEncoder.encode(definitionTags);
    const termTagsBytes = textEncoder.encode(termTags);
    const totalBytes = (
        RAW_TERM_CONTENT_SHARED_GLOSSARY_HEADER_BYTES +
        rulesBytes.byteLength +
        definitionTagsBytes.byteLength +
        termTagsBytes.byteLength
    );
    const bytes = new Uint8Array(totalBytes);
    bytes.set(RAW_TERM_CONTENT_SHARED_GLOSSARY_MAGIC, 0);
    const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    view.setUint32(4, rulesBytes.byteLength, true);
    view.setUint32(8, definitionTagsBytes.byteLength, true);
    view.setUint32(12, termTagsBytes.byteLength, true);
    view.setBigUint64(16, BigInt(glossaryOffset), true);
    view.setUint32(24, glossaryLength, true);
    let offset = RAW_TERM_CONTENT_SHARED_GLOSSARY_HEADER_BYTES;
    bytes.set(rulesBytes, offset);
    offset += rulesBytes.byteLength;
    bytes.set(definitionTagsBytes, offset);
    offset += definitionTagsBytes.byteLength;
    bytes.set(termTagsBytes, offset);
    return bytes;
}

/**
 * Returns a copy of an 'MBR2' record whose glossary offset is shifted by
 * `baseOffset`; returns the input unchanged when it is not a valid 'MBR2'
 * record or no shift is requested. Validates section lengths directly from
 * the header instead of fully decoding the record, avoiding a per-call
 * `TextDecoder` allocation and three throwaway string decodes, and performs
 * the addition in `BigInt` so offsets are exact beyond 2^53.
 * @param {Uint8Array} bytes
 * @param {number} baseOffset
 * @returns {Uint8Array}
 */
export function rebaseRawTermContentSharedGlossaryBinary(bytes, baseOffset) {
    if (!isRawTermContentSharedGlossaryBinary(bytes) || baseOffset === 0) {
        return bytes;
    }
    const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    const sectionLength = view.getUint32(4, true) + view.getUint32(8, true) + view.getUint32(12, true);
    if (RAW_TERM_CONTENT_SHARED_GLOSSARY_HEADER_BYTES + sectionLength !== bytes.byteLength) {
        // Malformed record: leave untouched, matching the decode-failure path.
        return bytes;
    }
    const rebasedBytes = Uint8Array.from(bytes);
    const rebasedView = new DataView(rebasedBytes.buffer, rebasedBytes.byteOffset, rebasedBytes.byteLength);
    rebasedView.setBigUint64(16, view.getBigUint64(16, true) + BigInt(baseOffset), true);
    return rebasedBytes;
}

/**
 * Decodes the header and string sections of a shared-glossary ('MBR2') record.
 * Returns `null` for non-'MBR2' input or when the declared section lengths do
 * not account for every byte of the record.
 * @param {Uint8Array} bytes
 * @param {TextDecoder} textDecoder
 * @returns {{rules: string, definitionTags: string, termTags: string, glossaryOffset: number, glossaryLength: number}|null}
 */
export function decodeRawTermContentSharedGlossaryHeader(bytes, textDecoder) {
    if (!isRawTermContentSharedGlossaryBinary(bytes)) {
        return null;
    }
    const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    const rulesLength = view.getUint32(4, true);
    const definitionTagsLength = view.getUint32(8, true);
    const termTagsLength = view.getUint32(12, true);
    // NOTE: assumes glossary offsets fit within Number.MAX_SAFE_INTEGER (2^53-1).
    const glossaryOffset = Number(view.getBigUint64(16, true));
    const glossaryLength = view.getUint32(24, true);
    const totalLength = RAW_TERM_CONTENT_SHARED_GLOSSARY_HEADER_BYTES + rulesLength + definitionTagsLength + termTagsLength;
    if (totalLength !== bytes.byteLength) {
        return null;
    }
    let offset = RAW_TERM_CONTENT_SHARED_GLOSSARY_HEADER_BYTES;
    const rules = textDecoder.decode(bytes.subarray(offset, offset + rulesLength));
    offset += rulesLength;
    const definitionTags = textDecoder.decode(bytes.subarray(offset, offset + definitionTagsLength));
    offset += definitionTagsLength;
    const termTags = textDecoder.decode(bytes.subarray(offset, offset + termTagsLength));
    return {rules, definitionTags, termTags, glossaryOffset, glossaryLength};
}

/**
 * Fully decodes an inline ('MBR1') record, including its glossary JSON string.
 * Returns `null` when the record is not valid 'MBR1'.
 * @param {Uint8Array} bytes
 * @param {TextDecoder} textDecoder
 * @returns {{rules: string, definitionTags: string, termTags: string, glossaryJson: string}|null}
 */
export function decodeRawTermContentBinary(bytes, textDecoder) {
    const header = decodeRawTermContentHeader(bytes, textDecoder);
    if (header === null) {
        return null;
    }
    const glossaryJson = textDecoder.decode(getRawTermContentGlossaryJsonBytes(bytes, header.glossaryJsonOffset, header.glossaryJsonLength));
    return {rules: header.rules, definitionTags: header.definitionTags, termTags: header.termTags, glossaryJson};
}
+ */ + +import sqlite3InitModule from '../../lib/sqlite/index.mjs'; +import {reportDiagnostics} from '../core/diagnostics-reporter.js'; + +export const DICTIONARY_DB_FILE = '/dict.sqlite3'; + +const DICTIONARY_DB_FILE_ALT = 'dict.sqlite3'; +const OPFS_SAHPOOL_VFS_NAME = 'opfs-sahpool'; + +let lastOpenUsedFallbackStorage = false; +/** @type {{mode: string, caller: string, runtimeContext: ReturnType|null, forceFallback: boolean, opfsReadyTimeoutMs: number, opfsReadyWait: {attempts: number, elapsedMs: number, ready: boolean}|null, hasOpfsDbCtor: boolean, hasOpfsImportDb: boolean, hasWasmfsDir: boolean, hasInstallOpfsSAHPoolVfs: boolean, hasOpfsVfs: boolean, hasOpfsSahpoolVfs: boolean, opfsVfsPtr: string|number|null, opfsSahpoolVfsPtr: string|number|null, openFailureClass: 'unsupported-opfs'|'lock-contention'|'corruption'|'transient-open-race'|'unknown'|null, attempts?: Array<{strategy: string, target: string, flags: string, error: string, errorClass: 'unsupported-opfs'|'lock-contention'|'corruption'|'transient-open-race'|'unknown'}>, lastError?: string|null}} */ +let lastOpenStorageDiagnostics = { + mode: 'unknown', + caller: 'unknown', + runtimeContext: null, + forceFallback: false, + opfsReadyTimeoutMs: 0, + opfsReadyWait: null, + hasOpfsDbCtor: false, + hasOpfsImportDb: false, + hasWasmfsDir: false, + hasInstallOpfsSAHPoolVfs: false, + hasOpfsVfs: false, + hasOpfsSahpoolVfs: false, + opfsVfsPtr: null, + opfsSahpoolVfsPtr: null, + openFailureClass: null, + attempts: [], + lastError: null, +}; + +/** + * @typedef {object} SqliteOpfsApi + * @property {(path: string, recursive?: boolean, throwIfNotFound?: boolean) => Promise|void} [unlink] + * @property {(path: string, content: Uint8Array) => Promise|void} [importDb] + */ + +/** @type {Promise|null} */ +let sqlite3Promise = null; +/** @type {Promise|null} */ +let opfsSahpoolInstallPromise = null; +/** @type {boolean} */ +let sqliteInitDiagnosticsReported = false; + +/** + * @returns {{href: string|null, origin: 
/**
 * Captures a snapshot of the current JS runtime relevant to OPFS availability:
 * global scope type, cross-origin isolation, and the File System Access API
 * surface. Every global lookup goes through `Reflect.get` so absent globals
 * never throw (e.g. `location` in a service worker build variant).
 * @returns {{href: string|null, origin: string|null, globalConstructor: string|null, isWindow: boolean, isWorkerGlobalScope: boolean, isServiceWorkerGlobalScope: boolean, crossOriginIsolated: boolean|null, hasSharedArrayBuffer: boolean, hasAtomics: boolean, hasNavigatorStorage: boolean, hasStorageGetDirectory: boolean, hasFileSystemHandle: boolean, hasFileSystemDirectoryHandle: boolean, hasFileSystemFileHandle: boolean, hasCreateSyncAccessHandle: boolean, userAgent: string|null}}
 */
function getRuntimeContextDiagnostics() {
    /**
     * @param {unknown} value
     * @returns {string|null}
     */
    const asString = (value) => (typeof value === 'string' ? value : null);
    const location = /** @type {Record<string, unknown>} */ (/** @type {unknown} */ (Reflect.get(globalThis, 'location') ?? {}));
    const navigator = /** @type {Record<string, unknown>} */ (/** @type {unknown} */ (Reflect.get(globalThis, 'navigator') ?? {}));
    const storage = /** @type {Record<string, unknown>} */ (Reflect.get(navigator, 'storage') ?? {});

    let globalConstructor = null;
    const globalCtor = /** @type {unknown} */ (Reflect.get(globalThis, 'constructor'));
    if (typeof globalCtor === 'function') {
        globalConstructor = asString(Reflect.get(/** @type {Record<string, unknown>} */ (/** @type {unknown} */ (globalCtor)), 'name'));
    }

    const fileHandleCtor = Reflect.get(globalThis, 'FileSystemFileHandle');
    const hasFileSystemFileHandle = typeof fileHandleCtor === 'function';
    // createSyncAccessHandle is the capability the OPFS VFS actually needs.
    const hasCreateSyncAccessHandle = (
        hasFileSystemFileHandle &&
        typeof Reflect.get(
            /** @type {{prototype?: Record<string, unknown>}} */ (/** @type {unknown} */ (fileHandleCtor)).prototype ?? {},
            'createSyncAccessHandle',
        ) === 'function'
    );
    const crossOriginIsolatedValue = Reflect.get(globalThis, 'crossOriginIsolated');
    const atomicsValue = Reflect.get(globalThis, 'Atomics');
    return {
        href: asString(location.href),
        origin: asString(location.origin),
        globalConstructor,
        isWindow: typeof Window === 'function' && globalThis instanceof Window,
        isWorkerGlobalScope: typeof WorkerGlobalScope === 'function' && globalThis instanceof WorkerGlobalScope,
        isServiceWorkerGlobalScope: typeof ServiceWorkerGlobalScope === 'function' && globalThis instanceof ServiceWorkerGlobalScope,
        crossOriginIsolated: typeof crossOriginIsolatedValue === 'boolean' ? crossOriginIsolatedValue : null,
        hasSharedArrayBuffer: typeof Reflect.get(globalThis, 'SharedArrayBuffer') === 'function',
        hasAtomics: typeof atomicsValue === 'object' && atomicsValue !== null,
        hasNavigatorStorage: typeof navigator === 'object' && navigator !== null && typeof storage === 'object' && storage !== null,
        hasStorageGetDirectory: typeof storage.getDirectory === 'function',
        hasFileSystemHandle: typeof Reflect.get(globalThis, 'FileSystemHandle') === 'function',
        hasFileSystemDirectoryHandle: typeof Reflect.get(globalThis, 'FileSystemDirectoryHandle') === 'function',
        hasFileSystemFileHandle,
        hasCreateSyncAccessHandle,
        userAgent: asString(navigator.userAgent),
    };
}

/**
 * Candidate database paths tried by the various open strategies.
 * @returns {string[]}
 */
function getDatabasePaths() {
    return [DICTIONARY_DB_FILE, DICTIONARY_DB_FILE_ALT];
}

/**
 * Resolves the wasmfs OPFS-backed database path, if this sqlite build exposes
 * `sqlite3_wasmfs_opfs_dir` and it reports a single-segment mount root
 * (e.g. `/opfs`). Returns an empty list otherwise.
 * @param {import('@sqlite.org/sqlite-wasm').Sqlite3Static} sqlite3
 * @returns {string[]}
 */
function getWasmfsDatabasePaths(sqlite3) {
    const getOpfsDir = sqlite3?.capi?.sqlite3_wasmfs_opfs_dir;
    if (typeof getOpfsDir !== 'function') { return []; }
    let opfsDir;
    try {
        opfsDir = String(getOpfsDir() ?? '');
    } catch (_) {
        return [];
    }
    return /^\/[^/]+$/.test(opfsDir) ? [`${opfsDir}/dict.sqlite3`] : [];
}

/**
 * Whether a value looks like a usable (non-null) VFS pointer. Numeric zero,
 * bigint zero, null, undefined, and non-finite numbers are all "no pointer".
 * @param {unknown} pointer
 * @returns {boolean}
 */
function isNonZeroPointer(pointer) {
    switch (typeof pointer) {
        case 'number': return Number.isFinite(pointer) && pointer !== 0;
        case 'bigint': return pointer !== 0n;
        case 'undefined': return false;
        default: return pointer !== null;
    }
}

/**
 * Converts a pointer-like value to a JSON-serializable form: finite numbers
 * pass through, bigints become decimal strings, everything else is null.
 * @param {unknown} pointer
 * @returns {string|number|null}
 */
function serializePointer(pointer) {
    switch (typeof pointer) {
        case 'number': return Number.isFinite(pointer) ? pointer : null;
        case 'bigint': return pointer.toString();
        default: return null;
    }
}
/**
 * Resolves after the given delay.
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}

/**
 * Buckets a database-open error message into a coarse failure class that
 * drives retry decisions ('lock-contention' and 'transient-open-race' are
 * retried) and diagnostics reporting. Matching is case-insensitive and
 * checked in priority order.
 * @param {string} message
 * @returns {'unsupported-opfs'|'lock-contention'|'corruption'|'transient-open-race'|'unknown'}
 */
function classifyOpenFailureMessage(message) {
    const text = message.toLowerCase();
    if (
        text.includes('no such vfs') ||
        text.includes('opfs is required') ||
        text.includes('missing sharedarraybuffer') ||
        text.includes('crossoriginisolated')
    ) {
        return 'unsupported-opfs';
    }
    // 'locked' also covers 'database is locked' / 'database table is locked',
    // so those longer phrases need no separate checks.
    if (text.includes('sqlite_busy') || text.includes('locked')) {
        return 'lock-contention';
    }
    if (
        text.includes('sqlite_corrupt') ||
        text.includes('database disk image is malformed') ||
        text.includes('file is not a database')
    ) {
        return 'corruption';
    }
    if (
        text.includes('sqlite_cantopen') ||
        text.includes('unable to open database file')
    ) {
        return 'transient-open-race';
    }
    return 'unknown';
}

/**
 * Exponential backoff with jitter for database-open retries:
 * 50ms * 2^attempt capped at 1000ms, plus 0-24ms of random jitter.
 * @param {number} attempt
 * @returns {number}
 */
function getOpenRetryDelayMs(attempt) {
    const base = Math.min(1000, 50 * (2 ** attempt));
    const jitter = Math.floor(Math.random() * 25);
    return base + jitter;
}
const targets = []; + for (const dbPath of getDatabasePaths()) { + targets.push(dbPath, `${dbPath}-wal`, `${dbPath}-shm`); + } + for (const target of targets) { + try { + await opfs.unlink(target, false, false); + } catch (_) { + // NOP - continue best-effort cleanup. + } + } +} + +/** + * @returns {Promise} + */ +export async function getSqlite3() { + if (sqlite3Promise !== null) { + return await sqlite3Promise; + } + const initWithOptions = /** @type {(options: {locateFile: (file: string) => string}) => Promise} */ (sqlite3InitModule); + /** + * @param {string} file + * @returns {string} + */ + const locateFile = (file) => new URL(`../../lib/sqlite/${file}`, import.meta.url).href; + sqlite3Promise = initWithOptions({ + locateFile, + }); + const sqlite3 = await sqlite3Promise; + if (!sqliteInitDiagnosticsReported) { + sqliteInitDiagnosticsReported = true; + const snapshot = getOpfsCapabilitySnapshot(sqlite3); + reportDiagnostics('opfs-sqlite-init', { + context: getRuntimeContextDiagnostics(), + hasOpfsDbCtor: snapshot.hasOpfsDbCtor, + hasInstallOpfsSAHPoolVfs: typeof Reflect.get(sqlite3, 'installOpfsSAHPoolVfs') === 'function', + hasOpfsImportDb: snapshot.hasOpfsImportDb, + hasWasmfsDir: snapshot.hasWasmfsDir, + hasOpfsVfs: snapshot.hasOpfsVfs, + hasOpfsSahpoolVfs: snapshot.hasOpfsSahpoolVfs, + opfsVfsPtr: snapshot.opfsVfsPtr, + opfsSahpoolVfsPtr: snapshot.opfsSahpoolVfsPtr, + sqliteVersion: sqlite3?.version?.libVersion ?? 
null, + }); + } + return sqlite3; +} + +/** + * @param {string} [caller] + * @returns {Promise} + */ +export async function openOpfsDatabase(caller = 'unknown') { + lastOpenUsedFallbackStorage = false; + const sqlite3 = await getSqlite3(); + const allowFallback = ( + Reflect.get(globalThis, 'yomitanRequireOpfs') === false || + Reflect.get(globalThis, 'yomitanAllowSqliteMemoryFallback') === true || + typeof Reflect.get(globalThis, 'chrome') === 'undefined' + ); + const forceFallback = Reflect.get(globalThis, 'yomitanForceSqliteFallback') === true; + const installOpfsSAHPoolVfs = /** @type {unknown} */ (Reflect.get(sqlite3, 'installOpfsSAHPoolVfs')); + const contextDiagnostics = getRuntimeContextDiagnostics(); + const opfsReadyTimeoutMsRaw = /** @type {unknown} */ (Reflect.get(globalThis, 'yomitanOpfsReadyTimeoutMs')); + const opfsReadyTimeoutMs = ( + typeof opfsReadyTimeoutMsRaw === 'number' && + Number.isFinite(opfsReadyTimeoutMsRaw) && + opfsReadyTimeoutMsRaw >= 0 + ) ? + opfsReadyTimeoutMsRaw : + 3000; + let capability = getOpfsCapabilitySnapshot(sqlite3); + lastOpenStorageDiagnostics = { + mode: 'opening', + caller, + runtimeContext: contextDiagnostics, + forceFallback, + opfsReadyTimeoutMs, + opfsReadyWait: null, + hasOpfsDbCtor: capability.hasOpfsDbCtor, + hasOpfsImportDb: capability.hasOpfsImportDb, + hasWasmfsDir: capability.hasWasmfsDir, + hasInstallOpfsSAHPoolVfs: typeof installOpfsSAHPoolVfs === 'function', + hasOpfsVfs: capability.hasOpfsVfs, + hasOpfsSahpoolVfs: capability.hasOpfsSahpoolVfs, + opfsVfsPtr: capability.opfsVfsPtr, + opfsSahpoolVfsPtr: capability.opfsSahpoolVfsPtr, + openFailureClass: null, + attempts: [], + lastError: null, + }; + const attempts = /** @type {Array<{strategy: string, target: string, flags: string, error: string, errorClass: 'unsupported-opfs'|'lock-contention'|'corruption'|'transient-open-race'|'unknown'}>} */ (lastOpenStorageDiagnostics.attempts); + /** + * @param {string} strategy + * @param {string} target + * @param 
{string} flags + * @param {unknown} error + */ + const pushAttemptError = (strategy, target, flags, error) => { + const message = (error instanceof Error) ? error.message : String(error); + const errorClass = classifyOpenFailureMessage(message); + attempts.push({strategy, target, flags, error: message, errorClass}); + if (attempts.length > 40) { + attempts.shift(); + } + lastOpenStorageDiagnostics.lastError = message; + lastOpenStorageDiagnostics.openFailureClass = errorClass; + }; + /** + * @returns {ReturnType} + */ + const syncCapabilityIntoDiagnostics = () => { + capability = getOpfsCapabilitySnapshot(sqlite3); + lastOpenStorageDiagnostics.hasOpfsDbCtor = capability.hasOpfsDbCtor; + lastOpenStorageDiagnostics.hasOpfsImportDb = capability.hasOpfsImportDb; + lastOpenStorageDiagnostics.hasWasmfsDir = capability.hasWasmfsDir; + lastOpenStorageDiagnostics.hasOpfsVfs = capability.hasOpfsVfs; + lastOpenStorageDiagnostics.hasOpfsSahpoolVfs = capability.hasOpfsSahpoolVfs; + lastOpenStorageDiagnostics.opfsVfsPtr = capability.opfsVfsPtr; + lastOpenStorageDiagnostics.opfsSahpoolVfsPtr = capability.opfsSahpoolVfsPtr; + return capability; + }; + /** + * @returns {boolean} + */ + const isOpfsReadyForOpen = () => ( + capability.hasOpfsDbCtor || + capability.hasOpfsVfs || + capability.hasWasmfsDir || + capability.hasOpfsImportDb || + capability.hasOpfsSahpoolVfs + ); + reportDiagnostics('opfs-open-begin', { + caller, + allowFallback, + forceFallback, + opfsReadyTimeoutMs, + context: contextDiagnostics, + diagnostics: lastOpenStorageDiagnostics, + }); + if (forceFallback) { + lastOpenStorageDiagnostics.mode = 'forced-fallback-disallowed'; + reportDiagnostics('opfs-open-failed', { + stage: 'forced-fallback-disallowed', + caller, + context: contextDiagnostics, + diagnostics: lastOpenStorageDiagnostics, + }); + throw new Error(`OPFS is required; forced fallback is disabled. 
diagnostics=${JSON.stringify(lastOpenStorageDiagnostics)}`); + } + if (!isOpfsReadyForOpen() && opfsReadyTimeoutMs > 0) { + const start = Date.now(); + let waitAttempts = 0; + while ((Date.now() - start) < opfsReadyTimeoutMs) { + await sleep(100); + ++waitAttempts; + syncCapabilityIntoDiagnostics(); + if (isOpfsReadyForOpen()) { break; } + } + lastOpenStorageDiagnostics.opfsReadyWait = { + attempts: waitAttempts, + elapsedMs: Date.now() - start, + ready: isOpfsReadyForOpen(), + }; + } + syncCapabilityIntoDiagnostics(); + /** + * @returns {import('@sqlite.org/sqlite-wasm').Database|null} + */ + const tryOpenWasmfsPersistent = () => { + const persistentPaths = getWasmfsDatabasePaths(sqlite3); + for (const dbPath of persistentPaths) { + for (const flags of ['cw', 'c']) { + try { + return new sqlite3.oo1.DB(dbPath, flags); + } catch (error) { + pushAttemptError('wasmfs-persistent', dbPath, flags, error); + // Try the next wasmfs path/flag combination. + } + } + } + return null; + }; + /** + * @returns {import('@sqlite.org/sqlite-wasm').Database|null} + */ + const tryOpenViaUri = () => { + for (const dbPath of getDatabasePaths()) { + const uri = `file:${dbPath}?vfs=opfs`; + for (const flags of ['cw', 'c']) { + try { + return new sqlite3.oo1.DB(uri, flags); + } catch (error) { + pushAttemptError('uri-opfs', uri, flags, error); + // Try the next URI/flag combination. + } + } + } + return null; + }; + /** + * @returns {import('@sqlite.org/sqlite-wasm').Database|null} + */ + const tryOpenViaSahpoolUri = () => { + for (const dbPath of getDatabasePaths()) { + const uri = `file:${dbPath}?vfs=${OPFS_SAHPOOL_VFS_NAME}`; + for (const flags of ['cw', 'c']) { + try { + return new sqlite3.oo1.DB(uri, flags); + } catch (error) { + pushAttemptError('uri-opfs-sahpool', uri, flags, error); + // Try the next URI/flag combination. 
+ } + } + } + return null; + }; + /** + * @returns {Promise} + */ + const ensureOpfsSahpoolVfs = async () => { + const findVfs2 = sqlite3?.capi?.sqlite3_vfs_find; + if (typeof findVfs2 === 'function' && isNonZeroPointer(findVfs2(OPFS_SAHPOOL_VFS_NAME))) { + return true; + } + if (typeof installOpfsSAHPoolVfs !== 'function') { + return false; + } + if (opfsSahpoolInstallPromise === null) { + opfsSahpoolInstallPromise = (async () => { + try { + await /** @type {(opts: {name?: string}) => Promise} */ (installOpfsSAHPoolVfs)({ + name: OPFS_SAHPOOL_VFS_NAME, + }); + return true; + } catch (error) { + pushAttemptError('install-opfs-sahpool-vfs', OPFS_SAHPOOL_VFS_NAME, '-', error); + return false; + } + })(); + } + const installed = await opfsSahpoolInstallPromise; + syncCapabilityIntoDiagnostics(); + if (typeof findVfs2 === 'function') { + return isNonZeroPointer(findVfs2(OPFS_SAHPOOL_VFS_NAME)); + } + return installed; + }; + if (typeof sqlite3.oo1.OpfsDb === 'function') { + const OpfsDb = sqlite3.oo1.OpfsDb; + /** + * @returns {import('@sqlite.org/sqlite-wasm').Database|null} + */ + const tryOpen = () => { + for (const dbPath of getDatabasePaths()) { + for (const flags of ['cw', 'c']) { + try { + return new OpfsDb(dbPath, flags); + } catch (error) { + pushAttemptError('opfsdb', dbPath, flags, error); + // Try the next path/flag combination. + } + } + } + return null; + }; + + /** + * @returns {Promise} + */ + const tryOpenWithRetry = async () => { + const maxAttempts = 8; + for (let attempt = 0; attempt < maxAttempts; ++attempt) { + const opened = tryOpen(); + if (opened !== null) { return opened; } + const retryableError = attempts.length > 0 ? 
attempts[attempts.length - 1].error : ''; + const retryableErrorClass = classifyOpenFailureMessage(retryableError); + const shouldRetry = ( + retryableErrorClass === 'lock-contention' || + retryableErrorClass === 'transient-open-race' + ); + if (!shouldRetry) { break; } + await sleep(getOpenRetryDelayMs(attempt)); + } + return null; + }; + + const opened = await tryOpenWithRetry(); + if (opened !== null) { + lastOpenStorageDiagnostics.mode = 'opfsdb'; + reportDiagnostics('opfs-open-success', { + caller, + context: contextDiagnostics, + diagnostics: lastOpenStorageDiagnostics, + }); + return opened; + } + } + + const openedViaUri = tryOpenViaUri(); + if (openedViaUri !== null) { + lastOpenStorageDiagnostics.mode = 'uri-opfs'; + reportDiagnostics('opfs-open-success', { + caller, + context: contextDiagnostics, + diagnostics: lastOpenStorageDiagnostics, + }); + return openedViaUri; + } + + const openedViaWasmfsPersistentPath = tryOpenWasmfsPersistent(); + if (openedViaWasmfsPersistentPath !== null) { + lastOpenStorageDiagnostics.mode = 'wasmfs-persistent'; + reportDiagnostics('opfs-open-success', { + caller, + context: contextDiagnostics, + diagnostics: lastOpenStorageDiagnostics, + }); + return openedViaWasmfsPersistentPath; + } + + if (await ensureOpfsSahpoolVfs()) { + const openedViaSahpoolUri = tryOpenViaSahpoolUri(); + if (openedViaSahpoolUri !== null) { + lastOpenStorageDiagnostics.mode = 'uri-opfs-sahpool'; + syncCapabilityIntoDiagnostics(); + reportDiagnostics('opfs-open-success', { + caller, + context: contextDiagnostics, + diagnostics: lastOpenStorageDiagnostics, + }); + return openedViaSahpoolUri; + } + syncCapabilityIntoDiagnostics(); + } + + if (!allowFallback) { + syncCapabilityIntoDiagnostics(); + lastOpenStorageDiagnostics.mode = 'opfs-unavailable'; + reportDiagnostics('opfs-open-failed', { + stage: 'opfs-unavailable', + caller, + context: contextDiagnostics, + diagnostics: lastOpenStorageDiagnostics, + failureClass: 
lastOpenStorageDiagnostics.openFailureClass, + }); + throw new Error(`OPFS is required but unavailable. diagnostics=${JSON.stringify(lastOpenStorageDiagnostics)}`); + } + + lastOpenUsedFallbackStorage = true; + try { + lastOpenStorageDiagnostics.mode = 'fallback-memory'; + reportDiagnostics('opfs-open-fallback-memory', { + caller, + context: contextDiagnostics, + diagnostics: lastOpenStorageDiagnostics, + }); + return new sqlite3.oo1.DB(':memory:', 'ct'); + } catch (e) { + lastOpenStorageDiagnostics.mode = 'fallback-memory-open-failed'; + lastOpenStorageDiagnostics.openFailureClass = classifyOpenFailureMessage(String(e)); + reportDiagnostics('opfs-open-failed', { + stage: 'fallback-memory-open-failed', + caller, + context: contextDiagnostics, + diagnostics: lastOpenStorageDiagnostics, + failureClass: lastOpenStorageDiagnostics.openFailureClass, + error: String(e), + }); + throw new Error(`Fallback in-memory database open failed. diagnostics=${JSON.stringify(lastOpenStorageDiagnostics)} error=${String(e)}`); + } +} + +/** + * @returns {boolean} + */ +export function didLastOpenUseFallbackStorage() { + return lastOpenUsedFallbackStorage; +} + +/** + * @returns {{mode: string, caller: string, runtimeContext: ReturnType|null, forceFallback: boolean, opfsReadyTimeoutMs: number, opfsReadyWait: {attempts: number, elapsedMs: number, ready: boolean}|null, hasOpfsDbCtor: boolean, hasOpfsImportDb: boolean, hasWasmfsDir: boolean, hasInstallOpfsSAHPoolVfs: boolean, hasOpfsVfs: boolean, hasOpfsSahpoolVfs: boolean, opfsVfsPtr: string|number|null, opfsSahpoolVfsPtr: string|number|null, openFailureClass: 'unsupported-opfs'|'lock-contention'|'corruption'|'transient-open-race'|'unknown'|null}} + */ +export function getLastOpenStorageDiagnostics() { + return {...lastOpenStorageDiagnostics}; +} + +/** + * @returns {Promise} + */ +export async function deleteOpfsDatabaseFiles() { + const sqlite3 = await getSqlite3(); + const allowFallback = ( + Reflect.get(globalThis, 
'yomitanRequireOpfs') === false || + Reflect.get(globalThis, 'yomitanAllowSqliteMemoryFallback') === true || + typeof Reflect.get(globalThis, 'chrome') === 'undefined' + ); + const opfs = /** @type {{opfs?: SqliteOpfsApi}} */ (/** @type {unknown} */ (sqlite3)).opfs; + if (typeof opfs?.unlink !== 'function') { + reportDiagnostics('opfs-delete-files-unavailable', { + allowFallback, + context: getRuntimeContextDiagnostics(), + diagnostics: getLastOpenStorageDiagnostics(), + }); + if (allowFallback) { + return false; + } + throw new Error('OPFS unlink API is unavailable'); + } + await deleteOpfsDatabaseFilesInternal(opfs); + + return true; +} + +/** + * @param {ArrayBuffer} content + * @returns {Promise} + */ +export async function importOpfsDatabase(content) { + const sqlite3 = await getSqlite3(); + const opfs = /** @type {{opfs?: SqliteOpfsApi}} */ (/** @type {unknown} */ (sqlite3)).opfs; + if (typeof opfs?.importDb !== 'function') { + reportDiagnostics('opfs-import-db-unavailable', { + context: getRuntimeContextDiagnostics(), + diagnostics: getLastOpenStorageDiagnostics(), + contentBytes: content.byteLength, + }); + throw new Error('OPFS importDb API is unavailable'); + } + const bytes = new Uint8Array(content); + for (const dbPath of getDatabasePaths()) { + try { + await opfs.importDb(dbPath, bytes); + return; + } catch (_) { + // Try alternate path variant. 
+ } + } + throw new Error('Failed to import OPFS database using available path variants'); +} diff --git a/ext/js/dictionary/term-bank-wasm-parser.js b/ext/js/dictionary/term-bank-wasm-parser.js new file mode 100644 index 0000000000..14dff7ab3e --- /dev/null +++ b/ext/js/dictionary/term-bank-wasm-parser.js @@ -0,0 +1,678 @@ +/* + * Copyright (C) 2023-2025 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +import {parseJson} from '../core/json.js'; + +const META_U32_FIELDS = 16; +const U8_BACKSLASH = 0x5c; +const U8_QUOTE = 0x22; +const U8_N = 0x6e; +const U8_U = 0x75; +const U8_L = 0x6c; + +const CONTENT_META_U32_FIELDS = 4; +const DEFAULT_ROW_CHUNK_SIZE = 2048; +const GLOSSARY_MEDIA_MARKER_IMAGE = new Uint8Array([0x22, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x22]); // "image" +const GLOSSARY_MEDIA_MARKER_IMG = new Uint8Array([0x22, 0x69, 0x6d, 0x67, 0x22]); // "img" +const EMPTY_UINT8_ARRAY = new Uint8Array(0); +/** @type {Promise<{memory: WebAssembly.Memory, wasm_reset_heap: () => void, wasm_alloc: (size: number) => number, parse_term_bank: (jsonPtr: number, jsonLen: number, outPtr: number, outCapacity: number) => number, encode_term_content: (jsonPtr: number, metasPtr: number, rowCount: number, outPtr: number, outCapacity: number, rowMetaPtr: number) => number}>|null} */ +let wasmPromise = null; + +/** @type {TextDecoder} */ +const textDecoder = new TextDecoder(); +/** @type {{bufferSetupMs: number, allocationMs: number, copyJsonMs: number, parseBankMs: number, encodeContentMs: number, rowDecodeMs: number, chunkDispatchMs: number, rowCount: number, chunkCount: number, chunkSize: number, minimalDecode: boolean, includeContentMetadata: boolean, copyContentBytes: boolean, reuseExpressionForReadingDecode: boolean, skipTagRuleDecode: boolean, lazyGlossaryDecode: boolean, mediaHintFastScan: boolean}|null} */ +let lastTermBankWasmParseProfile = null; +/** @type {number} */ +let lastSuccessfulMetaCapacity = 0; +/** @type {number} */ +let lastSuccessfulContentBytesPerRow = 0; + +/** + * @returns {Promise<{memory: WebAssembly.Memory, wasm_reset_heap: () => void, wasm_alloc: (size: number) => number, parse_term_bank: (jsonPtr: number, jsonLen: number, outPtr: number, outCapacity: number) => number, encode_term_content: (jsonPtr: number, metasPtr: number, rowCount: number, outPtr: number, outCapacity: number, rowMetaPtr: number) => number}>} + */ +async function getWasm() { + 
if (wasmPromise !== null) { + return await wasmPromise; + } + wasmPromise = (async () => { + const url = new URL('../../lib/term-bank-parser.wasm', import.meta.url); + const response = await fetch(url); + const bytes = await response.arrayBuffer(); + const instance = await WebAssembly.instantiate(bytes, {}); + const exports = /** @type {WebAssembly.Exports & {memory?: WebAssembly.Memory, wasm_reset_heap?: () => void, wasm_alloc?: (size: number) => number, parse_term_bank?: (jsonPtr: number, jsonLen: number, outPtr: number, outCapacity: number) => number, encode_term_content?: (jsonPtr: number, metasPtr: number, rowCount: number, outPtr: number, outCapacity: number, rowMetaPtr: number) => number}} */ (instance.instance.exports); + if ( + !(exports.memory instanceof WebAssembly.Memory) || + typeof exports.wasm_reset_heap !== 'function' || + typeof exports.wasm_alloc !== 'function' || + typeof exports.parse_term_bank !== 'function' || + typeof exports.encode_term_content !== 'function' + ) { + throw new Error('term-bank wasm parser exports are invalid'); + } + return { + memory: exports.memory, + wasm_reset_heap: exports.wasm_reset_heap, + wasm_alloc: exports.wasm_alloc, + parse_term_bank: exports.parse_term_bank, + encode_term_content: exports.encode_term_content, + }; + })(); + return await wasmPromise; +} + +/** + * @returns {{bufferSetupMs: number, allocationMs: number, copyJsonMs: number, parseBankMs: number, encodeContentMs: number, rowDecodeMs: number, chunkDispatchMs: number, rowCount: number, chunkCount: number, chunkSize: number, minimalDecode: boolean, includeContentMetadata: boolean, copyContentBytes: boolean, reuseExpressionForReadingDecode: boolean, skipTagRuleDecode: boolean, lazyGlossaryDecode: boolean, mediaHintFastScan: boolean}|null} + */ +export function consumeLastTermBankWasmParseProfile() { + const value = lastTermBankWasmParseProfile; + lastTermBankWasmParseProfile = null; + return value; +} + +/** + * @param {Uint8Array} source + * @param 
{number} start + * @param {number} length + * @returns {string} + */ +function decodeJsonStringToken(source, start, length) { + if (length < 2 || source[start] !== U8_QUOTE || source[start + length - 1] !== U8_QUOTE) { + return ''; + } + if (length === 2) { + return ''; + } + const valueStart = start + 1; + const valueEnd = start + length - 1; + const valueBytes = source.subarray(valueStart, valueEnd); + if (!valueBytes.includes(U8_BACKSLASH)) { + return textDecoder.decode(valueBytes); + } + const quoted = textDecoder.decode(source.subarray(start, start + length)); + return /** @type {string} */ (parseJson(quoted)); +} + +/** + * @param {Uint8Array} source + * @param {number} start + * @param {number} length + * @returns {string|null} + */ +function decodeNullableJsonStringToken(source, start, length) { + if (length === 4 && source[start] === U8_N && source[start + 1] === U8_U && source[start + 2] === U8_L && source[start + 3] === U8_L) { + return null; + } + return decodeJsonStringToken(source, start, length); +} + +/** + * @param {Uint8Array} source + * @param {number} start + * @param {number} length + * @returns {boolean} + */ +function isNullToken(source, start, length) { + return length === 4 && source[start] === U8_N && source[start + 1] === U8_U && source[start + 2] === U8_L && source[start + 3] === U8_L; +} + +/** + * @param {Uint8Array} source + * @param {number} start + * @param {number} length + * @param {number} fallback + * @returns {number} + */ +function decodeNumberToken(source, start, length, fallback) { + if (length <= 0) { return fallback; } + let i = start; + const end = start + length; + let sign = 1; + if (source[i] === 0x2d) { // '-' + sign = -1; + ++i; + if (i >= end) { return fallback; } + } + let value = 0; + let hasDigit = false; + for (; i < end; ++i) { + const c = source[i]; + if (c >= 0x30 && c <= 0x39) { // '0'..'9' + value = (value * 10) + (c - 0x30); + hasDigit = true; + continue; + } + const raw = 
textDecoder.decode(source.subarray(start, end)); + const parsed = Number.parseInt(raw, 10); + return Number.isFinite(parsed) ? parsed : fallback; + } + return hasDigit ? (sign * value) : fallback; +} + +/** + * @param {Uint8Array} source + * @param {number} start + * @param {number} length + * @returns {string} + */ +function decodeRawToken(source, start, length) { + if (length <= 0) { return ''; } + return textDecoder.decode(source.subarray(start, start + length)); +} + +/** + * @param {Uint8Array} source + * @param {number} start + * @param {number} length + * @param {Uint8Array} marker + * @returns {boolean} + */ +function tokenContainsMarker(source, start, length, marker) { + const markerLength = marker.length; + if (markerLength === 0 || length < markerLength) { + return false; + } + const end = start + length - markerLength; + for (let i = start; i <= end; ++i) { + let matches = true; + for (let j = 0; j < markerLength; ++j) { + if (source[i + j] !== marker[j]) { + matches = false; + break; + } + } + if (matches) { + return true; + } + } + return false; +} + +/** + * @param {Uint8Array} source + * @param {number} start + * @param {number} length + * @returns {boolean} + */ +function glossaryTokenLikelyContainsMedia(source, start, length) { + const contains = tokenContainsMarker; + return ( + contains(source, start, length, GLOSSARY_MEDIA_MARKER_IMAGE) || + contains(source, start, length, GLOSSARY_MEDIA_MARKER_IMG) + ); +} + +/** + * @param {Uint8Array} source + * @param {number} startA + * @param {number} lengthA + * @param {number} startB + * @param {number} lengthB + * @returns {boolean} + */ +function tokenBytesEqual(source, startA, lengthA, startB, lengthB) { + if (lengthA !== lengthB) { return false; } + for (let i = 0; i < lengthA; ++i) { + if (source[startA + i] !== source[startB + i]) { + return false; + } + } + return true; +} + +/** + * @param {Uint8Array} contentBytes + * @param {boolean} includeContentMetadata + * @param {number} 
initialMetaCapacityDivisor
 * @param {number} initialContentBytesPerRow
 * @returns {Promise<{heap: Uint8Array, source: Uint8Array, metas: Uint32Array, contentMetas: Uint32Array, contentOutPtr: number, rowCount: number, allocationMs: number, copyJsonMs: number, parseBankMs: number, encodeContentMs: number}>}
 * @throws {Error}
 */
// Runs the wasm parser over a raw term-bank JSON buffer and returns views over
// the wasm heap (NOT copies): `source` is the copied-in JSON, `metas` the
// packed u32 token offsets/lengths, and (optionally) `contentMetas` plus the
// encoded per-row content region at `contentOutPtr`. The views are only valid
// until the next wasm_reset_heap()/wasm_alloc() that grows memory.
// Both the metadata and content passes use a grow-and-retry loop: a return
// code of -2 means "capacity too small", anything else negative is fatal.
async function parseTermBankWasmBuffers(contentBytes, includeContentMetadata, initialMetaCapacityDivisor, initialContentBytesPerRow) {
    if (contentBytes.byteLength === 0) {
        // Empty input: avoid touching the wasm module at all.
        return {
            heap: new Uint8Array(0),
            source: new Uint8Array(0),
            metas: new Uint32Array(0),
            contentMetas: new Uint32Array(0),
            contentOutPtr: 0,
            rowCount: 0,
            allocationMs: 0,
            copyJsonMs: 0,
            parseBankMs: 0,
            encodeContentMs: 0,
        };
    }
    const wasm = await getWasm();
    // Bump-allocator reset: every allocation below lives until the next call.
    wasm.wasm_reset_heap();
    let allocationMs = 0;
    let copyJsonMs = 0;
    let parseBankMs = 0;
    let encodeContentMs = 0;
    let tStart = Date.now();
    const jsonPtr = wasm.wasm_alloc(contentBytes.byteLength);
    allocationMs += Math.max(0, Date.now() - tStart);
    if (jsonPtr === 0) {
        throw new Error('Failed to allocate wasm json buffer');
    }
    tStart = Date.now();
    // Copy the JSON bytes into wasm linear memory at jsonPtr.
    new Uint8Array(wasm.memory.buffer).set(contentBytes, jsonPtr);
    copyJsonMs += Math.max(0, Date.now() - tStart);

    // Initial row-capacity estimate: bytes/divisor, clamped; then bumped to the
    // largest capacity that previously succeeded to skip retry rounds.
    const normalizedMetaCapacityDivisor = Number.isFinite(initialMetaCapacityDivisor) ? Math.max(8, Math.min(128, Math.trunc(initialMetaCapacityDivisor))) : 24;
    let capacity = Math.max(1024, Math.floor(contentBytes.byteLength / normalizedMetaCapacityDivisor));
    if (capacity < 8192) { capacity = 8192; }
    if (lastSuccessfulMetaCapacity > 0) {
        capacity = Math.max(capacity, lastSuccessfulMetaCapacity);
    }
    let rowCount = -1;
    let outPtr = 0;
    for (let attempt = 0; attempt < 6; ++attempt) {
        tStart = Date.now();
        outPtr = wasm.wasm_alloc(capacity * META_U32_FIELDS * 4);
        allocationMs += Math.max(0, Date.now() - tStart);
        if (outPtr === 0) {
            throw new Error('Failed to allocate wasm term metadata buffer');
        }
        tStart = Date.now();
        rowCount = wasm.parse_term_bank(jsonPtr, contentBytes.byteLength, outPtr, capacity);
        parseBankMs += Math.max(0, Date.now() - tStart);
        if (rowCount >= 0) {
            break;
        }
        if (rowCount !== -2) {
            // Any code other than -2 (capacity exhausted) is a parse failure.
            throw new Error(`term-bank parser failed with code ${rowCount}`);
        }
        capacity *= 2;
    }
    if (rowCount < 0) {
        throw new Error(`term-bank parser exhausted capacity (code ${rowCount})`);
    }
    // Remember the winning capacity as a hint for the next bank.
    lastSuccessfulMetaCapacity = Math.max(lastSuccessfulMetaCapacity, capacity);

    if (!includeContentMetadata) {
        // Metadata-only mode: skip the content-encoding pass entirely.
        const heap = new Uint8Array(wasm.memory.buffer);
        const metas = new Uint32Array(wasm.memory.buffer, outPtr, rowCount * META_U32_FIELDS);
        const source = heap.subarray(jsonPtr, jsonPtr + contentBytes.byteLength);
        return {
            heap,
            source,
            metas,
            contentMetas: new Uint32Array(0),
            contentOutPtr: 0,
            rowCount,
            allocationMs,
            copyJsonMs,
            parseBankMs,
            encodeContentMs,
        };
    }

    tStart = Date.now();
    const contentMetaPtr = wasm.wasm_alloc(rowCount * CONTENT_META_U32_FIELDS * 4);
    allocationMs += Math.max(0, Date.now() - tStart);
    if (contentMetaPtr === 0) {
        throw new Error('Failed to allocate wasm content metadata buffer');
    }
    // Same estimate-then-retry scheme for the encoded-content output buffer.
    const normalizedInitialContentBytesPerRow = Number.isFinite(initialContentBytesPerRow) ? Math.max(16, Math.min(512, Math.trunc(initialContentBytesPerRow))) : 96;
    let contentOutCapacity = Math.max(contentBytes.byteLength, rowCount * normalizedInitialContentBytesPerRow);
    if (lastSuccessfulContentBytesPerRow > 0) {
        contentOutCapacity = Math.max(contentOutCapacity, rowCount * lastSuccessfulContentBytesPerRow);
    }
    let contentOutPtr = 0;
    let encodedContentBytes = -1;
    for (let attempt = 0; attempt < 6; ++attempt) {
        tStart = Date.now();
        contentOutPtr = wasm.wasm_alloc(contentOutCapacity);
        allocationMs += Math.max(0, Date.now() - tStart);
        if (contentOutPtr === 0) {
            throw new Error('Failed to allocate wasm content buffer');
        }
        tStart = Date.now();
        encodedContentBytes = wasm.encode_term_content(
            jsonPtr,
            outPtr,
            rowCount,
            contentOutPtr,
            contentOutCapacity,
            contentMetaPtr,
        );
        encodeContentMs += Math.max(0, Date.now() - tStart);
        if (encodedContentBytes >= 0) {
            break;
        }
        if (encodedContentBytes !== -2) {
            throw new Error(`term-content encoder failed with code ${encodedContentBytes}`);
        }
        contentOutCapacity *= 2;
    }
    if (encodedContentBytes < 0) {
        throw new Error(`term-content encoder exhausted capacity (code ${encodedContentBytes})`);
    }
    if (rowCount > 0) {
        // Update the per-row content size hint (+8 slack) for future banks.
        const nextContentBytesPerRow = Math.max(
            normalizedInitialContentBytesPerRow,
            Math.ceil(encodedContentBytes / rowCount) + 8,
        );
        lastSuccessfulContentBytesPerRow = Math.max(lastSuccessfulContentBytesPerRow, nextContentBytesPerRow);
    }

    // Re-create the views AFTER all allocations: wasm memory growth detaches
    // earlier ArrayBuffer views.
    const heap = new Uint8Array(wasm.memory.buffer);
    const metas = new Uint32Array(wasm.memory.buffer, outPtr, rowCount * META_U32_FIELDS);
    const source = heap.subarray(jsonPtr, jsonPtr + contentBytes.byteLength);
    const contentMetas = new Uint32Array(wasm.memory.buffer, contentMetaPtr, rowCount * CONTENT_META_U32_FIELDS);
    return {
        heap,
        source,
        metas,
        contentMetas,
        contentOutPtr,
        rowCount,
        allocationMs,
        copyJsonMs,
        parseBankMs,
        encodeContentMs,
    };
}

/**
 * @param {Uint8Array}
source
 * @param {Uint32Array} metas
 * @param {Uint32Array} contentMetas
 * @param {Uint8Array} heap
 * @param {number} contentOutPtr
 * @param {number} version
 * @param {number} i
 * @param {boolean} copyContentBytes
 * @param {boolean} includeContentMetadata
 * @param {boolean} reuseExpressionForReadingDecode
 * @param {boolean} skipTagRuleDecode
 * @param {boolean} lazyGlossaryDecode
 * @param {boolean} mediaHintFastScan
 * @returns {{expression: string, reading: string, definitionTags: string, rules: string, score: number, glossaryJson: string, glossaryJsonBytes?: Uint8Array, glossaryMayContainMedia?: boolean, sequence: number|null, termTags: string, termEntryContentHash1?: number, termEntryContentHash2?: number, termEntryContentBytes: Uint8Array}}
 */
// Decodes row i from the packed wasm output into a JS term record.
// Each row occupies META_U32_FIELDS u32s laid out as (start, length) pairs:
// [0..1] expression, [2..3] reading, [4..5] definitionTags, [6..7] rules,
// [8..9] score, [10..11] glossary, [12..13] sequence, [14..15] termTags.
// The skip*/lazy* flags trade completeness for speed; skipped fields come back
// as '' (or undefined for optional hint fields).
// NOTE(review): unless copyContentBytes is true, termEntryContentBytes (and
// glossaryJsonBytes) are views into the wasm heap that are invalidated by the
// next parse — callers must consume or copy them first.
function decodeParsedTermRow(source, metas, contentMetas, heap, contentOutPtr, version, i, copyContentBytes, includeContentMetadata, reuseExpressionForReadingDecode, skipTagRuleDecode, lazyGlossaryDecode, mediaHintFastScan) {
    const o = i * META_U32_FIELDS;
    const c = i * CONTENT_META_U32_FIELDS;
    const expressionStart = metas[o + 0];
    const expressionLength = metas[o + 1];
    const readingStart = metas[o + 2];
    const readingLength = metas[o + 3];
    const expression = decodeJsonStringToken(source, expressionStart, expressionLength);
    // When expression and reading bytes are identical, skip the second decode.
    const reuseExpressionReading = (
        reuseExpressionForReadingDecode &&
        tokenBytesEqual(source, expressionStart, expressionLength, readingStart, readingLength)
    );
    const reading = reuseExpressionReading ?
        expression :
        decodeJsonStringToken(source, readingStart, readingLength);
    const definitionTags = skipTagRuleDecode ? '' : (decodeNullableJsonStringToken(source, metas[o + 4], metas[o + 5]) ?? '');
    const rules = skipTagRuleDecode ? '' : decodeJsonStringToken(source, metas[o + 6], metas[o + 7]);
    const score = decodeNumberToken(source, metas[o + 8], metas[o + 9], 0);
    const glossaryStart = metas[o + 10];
    const glossaryLength = metas[o + 11];
    const glossaryJsonBytes = source.subarray(glossaryStart, glossaryStart + glossaryLength);
    // Lazy mode defers glossary string decoding; the raw bytes are returned instead.
    const glossaryJson = lazyGlossaryDecode ? '' : decodeRawToken(source, glossaryStart, glossaryLength);
    const glossaryMayContainMedia = mediaHintFastScan ? glossaryTokenLikelyContainsMedia(source, glossaryStart, glossaryLength) : void 0;
    // sequence and termTags only exist in term-bank format version >= 3.
    const sequence = version >= 3 ? (isNullToken(source, metas[o + 12], metas[o + 13]) ? null : decodeNumberToken(source, metas[o + 12], metas[o + 13], 0)) : null;
    const termTags = skipTagRuleDecode ? '' : (version >= 3 ? (decodeNullableJsonStringToken(source, metas[o + 14], metas[o + 15]) ?? '') : '');
    let termEntryContentHash1;
    let termEntryContentHash2;
    let termEntryContentBytes = EMPTY_UINT8_ARRAY;
    if (includeContentMetadata) {
        // Content metadata per row: (offset, length, hash1, hash2) into the
        // encoded content region at contentOutPtr.
        const contentOffset = contentMetas[c + 0];
        const contentLength = contentMetas[c + 1];
        const hash1 = contentMetas[c + 2];
        const hash2 = contentMetas[c + 3];
        const contentStart = contentOutPtr + contentOffset;
        const contentEnd = contentStart + contentLength;
        const contentSlice = heap.subarray(contentStart, contentEnd);
        termEntryContentBytes = copyContentBytes ? Uint8Array.from(contentSlice) : contentSlice;
        // >>> 0 forces the hashes to unsigned 32-bit values.
        termEntryContentHash1 = hash1 >>> 0;
        termEntryContentHash2 = hash2 >>> 0;
    }
    return {
        expression,
        reading,
        definitionTags,
        rules,
        score,
        glossaryJson,
        glossaryJsonBytes: lazyGlossaryDecode ? glossaryJsonBytes : void 0,
        glossaryMayContainMedia,
        sequence,
        termTags,
        termEntryContentHash1,
        termEntryContentHash2,
        termEntryContentBytes,
    };
}

/**
 * @param {Uint8Array} source
 * @param {Uint32Array} metas
 * @param {Uint32Array} contentMetas
 * @param {Uint8Array} heap
 * @param {number} contentOutPtr
 * @param {number} version
 * @param {number} i
 * @param {boolean} copyContentBytes
 * @param {boolean} includeContentMetadata
 * @param {boolean} reuseExpressionForReadingDecode
 * @returns {{expression: string, reading: string, definitionTags: string, rules: string, score: number, glossaryJson: string, glossaryJsonBytes?: Uint8Array, glossaryMayContainMedia?: boolean, sequence: number|null, termTags: string, termEntryContentHash1?: number, termEntryContentHash2?: number, termEntryContentBytes: Uint8Array}}
 */
// Minimal-decode variant: only expression/reading/score/sequence (and optional
// content metadata) are decoded; tags and rules are '' and glossaryJson is a
// constant '[]'. Used when callers need index fields but not display content.
function decodeParsedTermRowMinimal(source, metas, contentMetas, heap, contentOutPtr, version, i, copyContentBytes, includeContentMetadata, reuseExpressionForReadingDecode) {
    const o = i * META_U32_FIELDS;
    const c = i * CONTENT_META_U32_FIELDS;
    const expressionStart = metas[o + 0];
    const expressionLength = metas[o + 1];
    const readingStart = metas[o + 2];
    const readingLength = metas[o + 3];
    const expression = decodeJsonStringToken(source, expressionStart, expressionLength);
    const reuseExpressionReading = (
        reuseExpressionForReadingDecode &&
        tokenBytesEqual(source, expressionStart, expressionLength, readingStart, readingLength)
    );
    const reading = reuseExpressionReading ?
        expression :
        decodeJsonStringToken(source, readingStart, readingLength);
    const score = decodeNumberToken(source, metas[o + 8], metas[o + 9], 0);
    const sequence = version >= 3 ? (isNullToken(source, metas[o + 12], metas[o + 13]) ? null : decodeNumberToken(source, metas[o + 12], metas[o + 13], 0)) : null;
    let termEntryContentHash1;
    let termEntryContentHash2;
    let termEntryContentBytes = EMPTY_UINT8_ARRAY;
    if (includeContentMetadata) {
        const contentOffset = contentMetas[c + 0];
        const contentLength = contentMetas[c + 1];
        const hash1 = contentMetas[c + 2];
        const hash2 = contentMetas[c + 3];
        const contentStart = contentOutPtr + contentOffset;
        const contentEnd = contentStart + contentLength;
        const contentSlice = heap.subarray(contentStart, contentEnd);
        termEntryContentBytes = copyContentBytes ? Uint8Array.from(contentSlice) : contentSlice;
        termEntryContentHash1 = hash1 >>> 0;
        termEntryContentHash2 = hash2 >>> 0;
    }
    return {
        expression,
        reading,
        definitionTags: '',
        rules: '',
        score,
        glossaryJson: '[]',
        sequence,
        termTags: '',
        termEntryContentHash1,
        termEntryContentHash2,
        termEntryContentBytes,
    };
}

/**
 * Parses a raw term-bank JSON buffer in wasm and delivers decoded rows to
 * `onChunk` in batches of at most `chunkSize`.
 * @param {Uint8Array} contentBytes
 * @param {number} version
 * @param {(rows: {expression: string, reading: string, definitionTags: string, rules: string, score: number, glossaryJson: string, glossaryJsonBytes?: Uint8Array, glossaryMayContainMedia?: boolean, sequence: number|null, termTags: string, termEntryContentHash1?: number, termEntryContentHash2?: number, termEntryContentBytes: Uint8Array}[], progress: {processedRows: number, totalRows: number, chunkIndex: number, chunkCount: number}) => Promise<void>|void} onChunk
 * @param {number} [chunkSize]
 * @param {{copyContentBytes?: boolean, includeContentMetadata?: boolean, initialMetaCapacityDivisor?: number, initialContentBytesPerRow?: number, minimalDecode?: boolean, reuseExpressionForReadingDecode?: boolean, skipTagRuleDecode?: boolean, lazyGlossaryDecode?: boolean, mediaHintFastScan?: boolean, preallocateChunkRows?: boolean}} [options]
 * @returns {Promise<void>}
 */
export async function parseTermBankWithWasmChunks(contentBytes, version, onChunk, chunkSize = DEFAULT_ROW_CHUNK_SIZE,
options = {}) {
    // Normalize option flags; every speed/accuracy trade-off is opt-in.
    const copyContentBytes = options.copyContentBytes === true;
    const includeContentMetadata = options.includeContentMetadata !== false;
    const initialMetaCapacityDivisor = Number.isFinite(options.initialMetaCapacityDivisor) ? /** @type {number} */ (options.initialMetaCapacityDivisor) : 24;
    const initialContentBytesPerRow = Number.isFinite(options.initialContentBytesPerRow) ? /** @type {number} */ (options.initialContentBytesPerRow) : 96;
    const minimalDecode = options.minimalDecode === true;
    const reuseExpressionForReadingDecode = options.reuseExpressionForReadingDecode === true;
    const skipTagRuleDecode = options.skipTagRuleDecode === true;
    const lazyGlossaryDecode = options.lazyGlossaryDecode === true;
    const mediaHintFastScan = options.mediaHintFastScan === true;
    const preallocateChunkRows = options.preallocateChunkRows === true;
    const tBufferSetupStart = Date.now();
    const {
        heap,
        source,
        metas,
        contentMetas,
        contentOutPtr,
        rowCount,
        allocationMs,
        copyJsonMs,
        parseBankMs,
        encodeContentMs,
    } = await parseTermBankWasmBuffers(
        contentBytes,
        includeContentMetadata,
        initialMetaCapacityDivisor,
        initialContentBytesPerRow,
    );
    const bufferSetupMs = Math.max(0, Date.now() - tBufferSetupStart);
    if (rowCount === 0) {
        // No rows: record a zeroed profile and dispatch no chunks.
        lastTermBankWasmParseProfile = {
            bufferSetupMs,
            allocationMs,
            copyJsonMs,
            parseBankMs,
            encodeContentMs,
            rowDecodeMs: 0,
            chunkDispatchMs: 0,
            rowCount: 0,
            chunkCount: 0,
            chunkSize: 0,
            minimalDecode,
            includeContentMetadata,
            copyContentBytes,
            reuseExpressionForReadingDecode,
            skipTagRuleDecode,
            lazyGlossaryDecode,
            mediaHintFastScan,
        };
        return;
    }
    const normalizedChunkSize = Number.isFinite(chunkSize) ? Math.max(1, Math.trunc(chunkSize)) : DEFAULT_ROW_CHUNK_SIZE;
    const chunkCount = Math.max(1, Math.ceil(rowCount / normalizedChunkSize));
    /**
     * @param {number} size
     * @returns {{expression: string, reading: string, definitionTags: string, rules: string, score: number, glossaryJson: string, glossaryJsonBytes?: Uint8Array, glossaryMayContainMedia?: boolean, sequence: number|null, termTags: string, termEntryContentHash1?: number, termEntryContentHash2?: number, termEntryContentBytes: Uint8Array}[]}
     */
    const createRowBuffer = (size) => /** @type {{expression: string, reading: string, definitionTags: string, rules: string, score: number, glossaryJson: string, glossaryJsonBytes?: Uint8Array, glossaryMayContainMedia?: boolean, sequence: number|null, termTags: string, termEntryContentHash1?: number, termEntryContentHash2?: number, termEntryContentBytes: Uint8Array}[]} */ (new Array(size));
    /** @type {{expression: string, reading: string, definitionTags: string, rules: string, score: number, glossaryJson: string, glossaryJsonBytes?: Uint8Array, glossaryMayContainMedia?: boolean, sequence: number|null, termTags: string, termEntryContentHash1?: number, termEntryContentHash2?: number, termEntryContentBytes: Uint8Array}[]} */
    let rows = preallocateChunkRows ? createRowBuffer(Math.min(normalizedChunkSize, rowCount)) : [];
    // rowsIndex tracks the logical fill level of `rows` in both modes.
    let rowsIndex = 0;
    let chunkIndex = 0;
    let rowDecodeMs = 0;
    let chunkDispatchMs = 0;
    for (let i = 0; i < rowCount; ++i) {
        const tDecodeStart = Date.now();
        const row = minimalDecode ?
            decodeParsedTermRowMinimal(source, metas, contentMetas, heap, contentOutPtr, version, i, copyContentBytes, includeContentMetadata, reuseExpressionForReadingDecode) :
            decodeParsedTermRow(source, metas, contentMetas, heap, contentOutPtr, version, i, copyContentBytes, includeContentMetadata, reuseExpressionForReadingDecode, skipTagRuleDecode, lazyGlossaryDecode, mediaHintFastScan);
        rowDecodeMs += Math.max(0, Date.now() - tDecodeStart);
        if (preallocateChunkRows) {
            rows[rowsIndex] = row;
            ++rowsIndex;
        } else {
            rows.push(row);
            rowsIndex = rows.length;
        }
        if (rowsIndex >= normalizedChunkSize) {
            // Hand off the filled buffer and start a fresh one before awaiting,
            // so onChunk owns its array exclusively.
            const chunk = rows;
            rows = preallocateChunkRows ? createRowBuffer(Math.min(normalizedChunkSize, rowCount - (i + 1))) : [];
            rowsIndex = 0;
            ++chunkIndex;
            const tDispatchStart = Date.now();
            await onChunk(chunk, {
                processedRows: i + 1,
                totalRows: rowCount,
                chunkIndex,
                chunkCount,
            });
            chunkDispatchMs += Math.max(0, Date.now() - tDispatchStart);
        }
    }
    if (rowsIndex > 0) {
        // Flush the final partial chunk.
        if (preallocateChunkRows) {
            // Trim unfilled preallocated slots before dispatch.
            rows.length = rowsIndex;
        }
        ++chunkIndex;
        const tDispatchStart = Date.now();
        await onChunk(rows, {
            processedRows: rowCount,
            totalRows: rowCount,
            chunkIndex,
            chunkCount,
        });
        chunkDispatchMs += Math.max(0, Date.now() - tDispatchStart);
    }
    lastTermBankWasmParseProfile = {
        bufferSetupMs,
        allocationMs,
        copyJsonMs,
        parseBankMs,
        encodeContentMs,
        rowDecodeMs,
        chunkDispatchMs,
        rowCount,
        chunkCount,
        chunkSize: normalizedChunkSize,
        minimalDecode,
        includeContentMetadata,
        copyContentBytes,
        reuseExpressionForReadingDecode,
        skipTagRuleDecode,
        lazyGlossaryDecode,
        mediaHintFastScan,
    };
}

/**
 * Convenience wrapper: parses a full term bank and collects every decoded row
 * into one array. Always copies content bytes out of the wasm heap so the
 * returned rows remain valid after the next parse.
 * @param {Uint8Array} contentBytes
 * @param {number} version
 * @returns {Promise<{expression: string, reading: string, definitionTags: string, rules: string, score: number, glossaryJson: string, sequence: number|null, termTags: string, termEntryContentHash1?: number, termEntryContentHash2?: number, termEntryContentBytes: Uint8Array}[]>}
 */
export async function parseTermBankWithWasm(contentBytes, version) {
    /** @type {{expression: string, reading: string, definitionTags: string, rules: string, score: number, glossaryJson: string, sequence: number|null, termTags: string, termEntryContentHash1?: number, termEntryContentHash2?: number, termEntryContentBytes: Uint8Array}[]} */
    const rows = [];
    await parseTermBankWithWasmChunks(
        contentBytes,
        version,
        (chunk) => {
            rows.push(...chunk);
        },
        DEFAULT_ROW_CHUNK_SIZE,
        {copyContentBytes: true},
    );
    return rows;
}
diff --git a/ext/js/dictionary/term-content-opfs-store.js b/ext/js/dictionary/term-content-opfs-store.js
new file mode 100644
index 0000000000..5095ee2e67
--- /dev/null
+++ b/ext/js/dictionary/term-content-opfs-store.js
@@ -0,0 +1,914 @@
/*
 * Copyright (C) 2023-2025 Yomitan Authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
/*
 * Copyright (C) 2023-2025  Yomitan Authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

import {reportDiagnostics} from '../core/diagnostics-reporter.js';
import {safePerformance} from '../core/safe-performance.js';

// Name of the single OPFS file that holds all appended term-content bytes.
const FILE_NAME = 'yomitan-term-content.bin';
// Granularity of the read-side page cache; reads are served in 64 KiB pages.
const READ_PAGE_SIZE_BYTES = 64 * 1024;
// LRU page-cache capacity, tuned by navigator.deviceMemory (see _computeReadPageCacheMaxPages).
const DEFAULT_READ_PAGE_CACHE_MAX_PAGES = 128;
const LOW_MEMORY_READ_PAGE_CACHE_MAX_PAGES = 48;
const HIGH_MEMORY_READ_PAGE_CACHE_MAX_PAGES = 192;
// Target size of a merged write group; small chunks are concatenated up to this
// many bytes before a single writable.write() call (see _writePendingChunksCoalesced).
const DEFAULT_WRITE_COALESCE_TARGET_BYTES = 4 * 1024 * 1024;
const LOW_MEMORY_WRITE_COALESCE_TARGET_BYTES = 1024 * 1024;
const HIGH_MEMORY_WRITE_COALESCE_TARGET_BYTES = 16 * 1024 * 1024;
const RAW_BYTES_WRITE_COALESCE_TARGET_BYTES = 32 * 1024 * 1024;
// Maximum number of chunks merged into one write group.
const DEFAULT_WRITE_COALESCE_MAX_CHUNKS = 512;
const RAW_BYTES_WRITE_COALESCE_MAX_CHUNKS = 8192;
// Pending bytes accumulated during an import session before a flush is forced
// (see _finalizeAppendBatch).
const DEFAULT_WRITE_FLUSH_THRESHOLD_BYTES = 16 * 1024 * 1024;
const LOW_MEMORY_WRITE_FLUSH_THRESHOLD_BYTES = 8 * 1024 * 1024;
const HIGH_MEMORY_WRITE_FLUSH_THRESHOLD_BYTES = 128 * 1024 * 1024;
const RAW_BYTES_WRITE_FLUSH_THRESHOLD_BYTES = 256 * 1024 * 1024;

/**
 * Append-only byte store for dictionary term content.
 *
 * When OPFS is available (navigator.storage.getDirectory), bytes live in a single
 * OPFS file; otherwise they are kept as in-memory chunks with a parallel offset
 * index. Writes during an import session are buffered, coalesced into large
 * groups, and drained through a background queue; reads go through a 64 KiB LRU
 * page cache over a `File` snapshot, with recovery for NotReadableError.
 */
export class TermContentOpfsStore {
    constructor() {
        /** @type {FileSystemFileHandle|null} */
        this._fileHandle = null;
        /** @type {FileSystemWritableFileStream|null} */
        this._writable = null;
        // In-memory fallback storage (used only when _fileHandle === null).
        /** @type {Uint8Array[]} */
        this._chunks = [];
        // Absolute start offset of each entry in _chunks; kept sorted for binary search.
        /** @type {number[]} */
        this._chunkOffsets = [];
        // Total logical length of the store in bytes (includes not-yet-flushed appends).
        /** @type {number} */
        this._length = 0;
        /** @type {number} */
        this._pendingWriteBytes = 0;
        /** @type {Uint8Array[]} */
        this._pendingWriteChunks = [];
        // Resolves when the current background drain cycle finishes; null when idle.
        /** @type {Promise<void>|null} */
        this._queuedWritePromise = null;
        /** @type {Uint8Array[]} */
        this._queuedWriteChunks = [];
        /** @type {number} */
        this._flushThresholdBytes = this._computeWriteFlushThresholdBytes();
        /** @type {boolean} */
        this._importSessionActive = false;
        /** @type {boolean} */
        this._loadedForRead = false;
        // Snapshot of the OPFS file used for reads; invalidated on every write.
        /** @type {File|null} */
        this._readFile = null;
        // LRU page cache keyed by page index (offset / READ_PAGE_SIZE_BYTES).
        /** @type {Map<number, Uint8Array>} */
        this._readPageCache = new Map();
        // Single-entry cache for the most recent exact (offset, length) slice.
        /** @type {string} */
        this._lastSliceCacheKey = '';
        /** @type {Uint8Array|null} */
        this._lastSliceCacheValue = null;
        /** @type {boolean} */
        this._exactSliceCacheEnabled = true;
        /** @type {number} */
        this._readPageCacheMaxPages = this._computeReadPageCacheMaxPages();
        /** @type {number} */
        this._writeCoalesceTargetBytes = this._computeWriteCoalesceTargetBytes();
        /** @type {number} */
        this._writeCoalesceMaxChunks = this._computeWriteCoalesceMaxChunks();
        /** @type {number|null} */
        this._writeCoalesceMaxChunksOverride = null;
        /** @type {{flushPendingWritesMs: number, awaitQueuedWritesMs: number, closeWritableMs: number, totalMs: number, drainCycleCount: number, writeCallCount: number, singleChunkWriteCount: number, mergedWriteCount: number, totalWriteBytes: number, mergedWriteBytes: number, maxWriteBytes: number, minWriteBytes: number, mergedGroupChunkCount: number, maxMergedGroupChunkCount: number, minMergedGroupChunkCount: number, flushDueToBytesCount: number, flushDueToChunkCount: number, flushFinalGroupCount: number, writeCoalesceTargetBytes: number, writeCoalesceMaxChunks: number}|null} */
        this._lastEndImportSessionMetrics = null;
        /** @type {{drainCycleCount: number, writeCallCount: number, singleChunkWriteCount: number, mergedWriteCount: number, totalWriteBytes: number, mergedWriteBytes: number, maxWriteBytes: number, minWriteBytes: number, mergedGroupChunkCount: number, maxMergedGroupChunkCount: number, minMergedGroupChunkCount: number, flushDueToBytesCount: number, flushDueToChunkCount: number, flushFinalGroupCount: number, writeCoalesceTargetBytes: number, writeCoalesceMaxChunks: number}} */
        this._writeDrainMetrics = this._createEmptyWriteDrainMetrics();
        // 'raw-bytes' selects the larger coalesce/flush limits for bulk imports.
        /** @type {'baseline'|'raw-bytes'} */
        this._importStorageMode = 'baseline';
    }

    /**
     * Opens (creating if necessary) the backing OPFS file and resets all
     * buffered-write and read-cache state. In environments without OPFS this
     * leaves the store in pure in-memory mode.
     * @returns {Promise<void>}
     */
    async prepare() {
        await this._awaitQueuedWrites();
        await this._closeWritable();
        if (typeof navigator === 'undefined' || !('storage' in navigator) || !('getDirectory' in navigator.storage)) {
            return;
        }
        const root = await navigator.storage.getDirectory();
        this._fileHandle = await root.getFileHandle(FILE_NAME, {create: true});
        const file = await this._fileHandle.getFile();
        this._length = file.size;
        this._chunks = [];
        this._chunkOffsets = [];
        this._invalidateReadState();
        this._pendingWriteBytes = 0;
        this._pendingWriteChunks = [];
        this._queuedWritePromise = null;
        this._queuedWriteChunks = [];
        this._importSessionActive = false;
        this._lastEndImportSessionMetrics = null;
        this._writeDrainMetrics = this._createEmptyWriteDrainMetrics();
    }

    /**
     * Starts an import session: recomputes the coalescing/flush tuning, clears
     * write buffers and metrics, and opens a writable positioned at the current
     * end of the file. Idempotent while a session is active.
     * @returns {Promise<void>}
     */
    async beginImportSession() {
        if (this._importSessionActive) {
            return;
        }
        await this._awaitQueuedWrites();
        this._importSessionActive = true;
        this._writeCoalesceTargetBytes = this._computeWriteCoalesceTargetBytes();
        this._writeCoalesceMaxChunks = this._computeWriteCoalesceMaxChunks();
        this._flushThresholdBytes = this._computeWriteFlushThresholdBytes();
        this._pendingWriteBytes = 0;
        this._pendingWriteChunks = [];
        this._queuedWritePromise = null;
        this._queuedWriteChunks = [];
        this._lastEndImportSessionMetrics = null;
        this._writeDrainMetrics = this._createEmptyWriteDrainMetrics();
        if (this._fileHandle === null) {
            return;
        }
        this._writable = await this._fileHandle.createWritable({keepExistingData: true});
        await this._writable.seek(this._length);
    }

    /**
     * Ends the import session: flushes all buffered writes, waits for the
     * background drain queue, closes the writable, and records per-phase timing
     * metrics merged with the drain counters.
     * @returns {Promise<void>}
     */
    async endImportSession() {
        if (!this._importSessionActive && this._writable === null) {
            return;
        }
        const tStart = safePerformance.now();
        this._importSessionActive = false;
        const tFlushPendingWritesStart = safePerformance.now();
        await this._flushPendingWrites();
        const flushPendingWritesMs = safePerformance.now() - tFlushPendingWritesStart;
        const tAwaitQueuedWritesStart = safePerformance.now();
        await this._awaitQueuedWrites();
        const awaitQueuedWritesMs = safePerformance.now() - tAwaitQueuedWritesStart;
        const tCloseWritableStart = safePerformance.now();
        await this._closeWritable();
        const closeWritableMs = safePerformance.now() - tCloseWritableStart;
        this._lastEndImportSessionMetrics = {
            flushPendingWritesMs,
            awaitQueuedWritesMs,
            closeWritableMs,
            totalMs: safePerformance.now() - tStart,
            ...this._writeDrainMetrics,
        };
    }

    /**
     * @returns {{flushPendingWritesMs: number, awaitQueuedWritesMs: number, closeWritableMs: number, totalMs: number, drainCycleCount: number, writeCallCount: number, singleChunkWriteCount: number, mergedWriteCount: number, totalWriteBytes: number, mergedWriteBytes: number, maxWriteBytes: number, minWriteBytes: number, mergedGroupChunkCount: number, maxMergedGroupChunkCount: number, minMergedGroupChunkCount: number, flushDueToBytesCount: number, flushDueToChunkCount: number, flushFinalGroupCount: number, writeCoalesceTargetBytes: number, writeCoalesceMaxChunks: number}|null} Metrics from the most recent endImportSession, or null if none completed since prepare/reset.
     */
    getLastEndImportSessionMetrics() {
        return this._lastEndImportSessionMetrics;
    }

    /**
     * Selects the write-tuning profile; any value other than 'raw-bytes' is
     * normalized to 'baseline'. Also recomputes the derived limits and resets
     * the drain metrics.
     * @param {'baseline'|'raw-bytes'} mode
     */
    setImportStorageMode(mode) {
        this._importStorageMode = mode === 'raw-bytes' ? 'raw-bytes' : 'baseline';
        this._writeCoalesceTargetBytes = this._computeWriteCoalesceTargetBytes();
        this._writeCoalesceMaxChunks = this._computeWriteCoalesceMaxChunks();
        this._flushThresholdBytes = this._computeWriteFlushThresholdBytes();
        this._writeDrainMetrics = this._createEmptyWriteDrainMetrics();
    }

    /**
     * Overrides the per-group chunk-count limit used when coalescing writes.
     * Non-finite or non-positive values clear the override.
     * @param {number|null} value
     */
    setWriteCoalesceMaxChunksOverride(value) {
        this._writeCoalesceMaxChunksOverride = (typeof value === 'number' && Number.isFinite(value) && value > 0) ?
            Math.max(1, Math.trunc(value)) :
            null;
        this._writeCoalesceMaxChunks = this._computeWriteCoalesceMaxChunks();
        this._writeDrainMetrics = this._createEmptyWriteDrainMetrics();
    }

    /**
     * Enables/disables the single-entry exact-slice cache; disabling also
     * drops the currently cached slice.
     * @param {boolean} value
     */
    setExactSliceCacheEnabled(value) {
        this._exactSliceCacheEnabled = value;
        if (!value) {
            this._lastSliceCacheKey = '';
            this._lastSliceCacheValue = null;
        }
    }

    /**
     * Empties the store: truncates the OPFS file (when present) and clears all
     * in-memory chunks, buffers, caches, and metrics.
     * @returns {Promise<void>}
     */
    async reset() {
        await this._awaitQueuedWrites();
        await this._closeWritable();
        if (this._fileHandle === null) {
            this._chunks = [];
            this._chunkOffsets = [];
            this._length = 0;
            this._pendingWriteBytes = 0;
            this._pendingWriteChunks = [];
            this._queuedWritePromise = null;
            this._queuedWriteChunks = [];
            this._importSessionActive = false;
            this._lastEndImportSessionMetrics = null;
            this._writeDrainMetrics = this._createEmptyWriteDrainMetrics();
            this._invalidateReadState();
            return;
        }
        const writable = await this._fileHandle.createWritable();
        await writable.truncate(0);
        await writable.close();
        this._chunks = [];
        this._chunkOffsets = [];
        this._length = 0;
        this._invalidateReadState();
        // The file is known empty, so reads need no fresh File snapshot.
        this._loadedForRead = true;
        this._pendingWriteBytes = 0;
        this._pendingWriteChunks = [];
        this._queuedWritePromise = null;
        this._queuedWriteChunks = [];
        this._importSessionActive = false;
        this._lastEndImportSessionMetrics = null;
        this._writeDrainMetrics = this._createEmptyWriteDrainMetrics();
    }

    /**
     * Appends chunks and returns the (offset, length) span assigned to each,
     * in input order. Zero-length chunks still receive a span.
     * @param {Uint8Array[]} chunks
     * @returns {Promise<Array<{offset: number, length: number}>>}
     */
    async appendBatch(chunks) {
        if (chunks.length === 0) { return []; }
        /** @type {Array<{offset: number, length: number}>} */
        const spans = [];
        /** @type {number[]} */
        const offsets = [];
        /** @type {number[]} */
        const lengths = [];
        this._appendBatchInternal(chunks, offsets, lengths);
        for (let i = 0, ii = offsets.length; i < ii; ++i) {
            spans.push({offset: offsets[i], length: lengths[i]});
        }
        await this._finalizeAppendBatch(chunks);
        return spans;
    }

    /**
     * Like appendBatch, but writes the assigned offsets/lengths into
     * caller-provided arrays (which are cleared first) to avoid allocating
     * span objects.
     * @param {Uint8Array[]} chunks
     * @param {number[]} offsets
     * @param {number[]} lengths
     * @returns {Promise<void>}
     */
    async appendBatchToArrays(chunks, offsets, lengths) {
        if (chunks.length === 0) { return; }
        this._appendBatchInternal(chunks, offsets, lengths);
        await this._finalizeAppendBatch(chunks);
    }

    /**
     * Assigns offsets/lengths for chunks and advances _length. In in-memory
     * mode non-empty chunks are also retained in _chunks/_chunkOffsets; in OPFS
     * mode the actual bytes are handled later by _finalizeAppendBatch.
     * @param {Uint8Array[]} chunks
     * @param {number[]} offsets
     * @param {number[]} lengths
     * @returns {void}
     */
    _appendBatchInternal(chunks, offsets, lengths) {
        offsets.length = 0;
        lengths.length = 0;
        let nextOffset = this._length;
        for (const chunk of chunks) {
            const length = chunk.byteLength;
            offsets.push(nextOffset);
            lengths.push(length);
            if (length > 0) {
                if (this._fileHandle === null) {
                    this._chunkOffsets.push(nextOffset);
                    this._chunks.push(chunk);
                }
                nextOffset += length;
            }
        }
        this._length = nextOffset;
    }

    /**
     * OPFS mode: buffers the non-empty chunks as pending writes and flushes
     * them when no import session is active or the pending total reaches
     * _flushThresholdBytes. Outside a session the writable is also closed so
     * the bytes are durable immediately.
     * @param {Uint8Array[]} chunks
     * @returns {Promise<void>}
     */
    async _finalizeAppendBatch(chunks) {
        if (this._fileHandle !== null) {
            let totalBytes = 0;
            for (const chunk of chunks) {
                totalBytes += chunk.byteLength;
            }
            if (totalBytes > 0) {
                // Any write makes the current File snapshot and caches stale.
                this._invalidateReadState();
                for (const chunk of chunks) {
                    if (chunk.byteLength <= 0) { continue; }
                    this._pendingWriteBytes += chunk.byteLength;
                    this._pendingWriteChunks.push(chunk);
                }
                if (!this._importSessionActive || this._pendingWriteBytes >= this._flushThresholdBytes) {
                    await this._flushPendingWrites();
                    if (!this._importSessionActive) {
                        await this._awaitQueuedWrites();
                        await this._closeWritable();
                    }
                }
            }
        }
    }

    /**
     * Ensures reads can be served: drains all outstanding writes, closes the
     * writable, and takes a fresh File snapshot of the OPFS file (with one
     * recovery attempt on NotReadableError).
     * @returns {Promise<void>}
     */
    async ensureLoadedForRead() {
        await this._flushPendingWrites();
        await this._awaitQueuedWrites();
        await this._closeWritable();
        if (this._loadedForRead) { return; }
        if (this._fileHandle === null) {
            // Non-OPFS environments rely on in-memory chunks only.
            this._loadedForRead = true;
            return;
        }
        if (this._length <= 0) {
            this._loadedForRead = true;
            return;
        }
        try {
            const file = await this._fileHandle.getFile();
            this._length = file.size;
            this._readFile = file;
            this._readPageCache.clear();
            this._loadedForRead = true;
        } catch (error) {
            if (!this._isNotReadableFileError(error)) {
                throw error;
            }
            const recovered = await this._recoverFromNotReadableFileError('ensure-loaded', error);
            this._loadedForRead = recovered;
        }
    }

    /**
     * Moves the pending-write buffer into either the background drain queue
     * (during an import session, or when a drain is already running) or writes
     * it immediately. Opens the writable lazily, seeking to where the pending
     * bytes logically start (_length already includes them).
     * @returns {Promise<void>}
     */
    async _flushPendingWrites() {
        if (this._pendingWriteBytes <= 0 || this._pendingWriteChunks.length === 0 || this._fileHandle === null) {
            return;
        }
        if (this._writable === null) {
            this._writable = await this._fileHandle.createWritable({keepExistingData: true});
            const seekOffset = this._length - this._pendingWriteBytes;
            await this._writable.seek(Math.max(0, seekOffset));
        }
        const chunks = this._pendingWriteChunks;
        this._pendingWriteBytes = 0;
        this._pendingWriteChunks = [];
        if (this._importSessionActive) {
            this._queueWriteChunks(chunks);
            return;
        }
        if (this._queuedWritePromise !== null) {
            this._queueWriteChunks(chunks);
            return;
        }
        await this._writePendingChunksCoalesced(chunks);
    }

    /**
     * Adds chunks to the drain queue and starts a drain cycle if one is not
     * already running.
     * @param {Uint8Array[]} chunks
     * @returns {void}
     */
    _queueWriteChunks(chunks) {
        if (chunks.length === 0) {
            return;
        }
        for (const chunk of chunks) {
            this._queuedWriteChunks.push(chunk);
        }
        if (this._queuedWritePromise !== null) {
            return;
        }
        this._queuedWritePromise = this._drainQueuedWrites();
    }

    /**
     * Waits for the drain cycle that was in flight when called, if any.
     * NOTE(review): this awaits only the promise captured at call time; a drain
     * re-kicked afterwards (see _drainQueuedWrites) is not awaited here — callers
     * appear to tolerate this, but confirm before relying on full quiescence.
     * @returns {Promise<void>}
     */
    async _awaitQueuedWrites() {
        const promise = this._queuedWritePromise;
        if (promise === null) {
            return;
        }
        await promise;
    }

    /**
     * Background drain loop: repeatedly takes the whole queue and writes it
     * coalesced. If chunks arrive between the loop exiting and the finally
     * block, a new drain cycle is started.
     * @returns {Promise<void>}
     */
    async _drainQueuedWrites() {
        try {
            ++this._writeDrainMetrics.drainCycleCount;
            while (this._queuedWriteChunks.length > 0) {
                const chunks = this._queuedWriteChunks;
                this._queuedWriteChunks = [];
                await this._writePendingChunksCoalesced(chunks);
            }
        } finally {
            this._queuedWritePromise = null;
            if (this._queuedWriteChunks.length > 0) {
                this._queuedWritePromise = this._drainQueuedWrites();
            }
        }
    }

    /**
     * Writes chunks to the open writable, merging consecutive chunks into
     * groups bounded by _writeCoalesceTargetBytes and _writeCoalesceMaxChunks
     * so each write() call is large. Single-chunk groups are written without
     * copying. All counters in _writeDrainMetrics are updated here.
     * @param {Uint8Array[]} chunks
     * @returns {Promise<void>}
     */
    async _writePendingChunksCoalesced(chunks) {
        if (this._writable === null) { return; }
        /** @type {Uint8Array[]} */
        let group = [];
        let groupBytes = 0;
        // reason: 'bytes' | 'chunks' | 'final' — why this group is being flushed.
        const flushGroup = async (reason = 'final') => {
            if (group.length === 0 || this._writable === null) {
                group = [];
                groupBytes = 0;
                return;
            }
            if (reason === 'bytes') {
                ++this._writeDrainMetrics.flushDueToBytesCount;
            } else if (reason === 'chunks') {
                ++this._writeDrainMetrics.flushDueToChunkCount;
            } else {
                ++this._writeDrainMetrics.flushFinalGroupCount;
            }
            ++this._writeDrainMetrics.writeCallCount;
            this._writeDrainMetrics.totalWriteBytes += groupBytes;
            if (groupBytes > this._writeDrainMetrics.maxWriteBytes) {
                this._writeDrainMetrics.maxWriteBytes = groupBytes;
            }
            if (this._writeDrainMetrics.minWriteBytes === 0 || groupBytes < this._writeDrainMetrics.minWriteBytes) {
                this._writeDrainMetrics.minWriteBytes = groupBytes;
            }
            if (group.length === 1) {
                // Single chunk: write directly, no merge copy needed.
                ++this._writeDrainMetrics.singleChunkWriteCount;
                await this._writable.write(group[0]);
                group = [];
                groupBytes = 0;
                return;
            }
            ++this._writeDrainMetrics.mergedWriteCount;
            this._writeDrainMetrics.mergedGroupChunkCount += group.length;
            if (group.length > this._writeDrainMetrics.maxMergedGroupChunkCount) {
                this._writeDrainMetrics.maxMergedGroupChunkCount = group.length;
            }
            if (
                this._writeDrainMetrics.minMergedGroupChunkCount === 0 ||
                group.length < this._writeDrainMetrics.minMergedGroupChunkCount
            ) {
                this._writeDrainMetrics.minMergedGroupChunkCount = group.length;
            }
            this._writeDrainMetrics.mergedWriteBytes += groupBytes;
            const merged = new Uint8Array(groupBytes);
            let offset = 0;
            for (const chunk of group) {
                merged.set(chunk, offset);
                offset += chunk.byteLength;
            }
            await this._writable.write(merged);
            group = [];
            groupBytes = 0;
        };
        for (const chunk of chunks) {
            if (chunk.byteLength <= 0) { continue; }
            // Flush first if adding this chunk would exceed the byte target.
            const wouldOverflow = groupBytes > 0 && (groupBytes + chunk.byteLength) > this._writeCoalesceTargetBytes;
            if (wouldOverflow) {
                await flushGroup('bytes');
            }
            group.push(chunk);
            groupBytes += chunk.byteLength;
            if (group.length >= this._writeCoalesceMaxChunks || groupBytes >= this._writeCoalesceTargetBytes) {
                await flushGroup(group.length >= this._writeCoalesceMaxChunks ? 'chunks' : 'bytes');
            }
        }
        await flushGroup();
    }

    /**
     * Closes the writable if open; _writable is nulled even if close() throws.
     * @returns {Promise<void>}
     */
    async _closeWritable() {
        if (this._writable === null) {
            return;
        }
        try {
            await this._writable.close();
        } finally {
            this._writable = null;
        }
    }

    /**
     * Reads `length` bytes at `offset`. Returns null for out-of-range or
     * non-positive requests, or when the bytes cannot be produced.
     * NOTE(review): the returned array may be a cached value or (in-memory,
     * single-chunk mode) a subarray view of stored bytes — callers must treat
     * it as read-only.
     * @param {number} offset
     * @param {number} length
     * @returns {Promise<Uint8Array|null>}
     */
    async readSlice(offset, length) {
        if (offset < 0 || length <= 0) { return null; }
        const end = offset + length;
        if (end > this._length) { return null; }
        const cacheKey = `${offset}:${length}`;
        if (this._exactSliceCacheEnabled && this._lastSliceCacheKey === cacheKey && this._lastSliceCacheValue instanceof Uint8Array) {
            return this._lastSliceCacheValue;
        }
        /** @type {Uint8Array|null} */
        let result;
        if (this._fileHandle === null) {
            if (this._chunks.length === 0) { return null; }
            result = this._readSliceFromMemory(offset, length);
        } else {
            if (!this._loadedForRead) {
                await this.ensureLoadedForRead();
            }
            result = await this._readSliceFromFile(offset, length);
        }
        if (this._exactSliceCacheEnabled && result instanceof Uint8Array) {
            this._lastSliceCacheKey = cacheKey;
            this._lastSliceCacheValue = result;
        }
        return result;
    }

    /**
     * Serves a slice from the in-memory chunk list, copying across chunk
     * boundaries when necessary. Returns null if the span cannot be fully
     * satisfied.
     * @param {number} offset
     * @param {number} length
     * @returns {Uint8Array|null}
     */
    _readSliceFromMemory(offset, length) {
        if (this._chunks.length === 0) { return null; }
        const end = offset + length;

        // Fast path for single chunk.
        // (With one chunk its base offset is 0, so absolute and in-chunk
        // offsets coincide; returns a view, not a copy.)
        if (this._chunks.length === 1) {
            const chunk = this._chunks[0];
            return chunk.subarray(offset, end);
        }

        const output = new Uint8Array(length);
        let remaining = length;
        let outputOffset = 0;
        let cursor = offset;
        let chunkIndex = this._findChunkIndex(cursor);
        while (remaining > 0 && chunkIndex < this._chunks.length) {
            const chunkOffset = this._chunkOffsets[chunkIndex];
            const chunk = this._chunks[chunkIndex];
            const chunkEnd = chunkOffset + chunk.byteLength;
            const startInChunk = Math.max(0, cursor - chunkOffset);
            const available = chunkEnd - (chunkOffset + startInChunk);
            const copyLength = Math.min(remaining, available);
            if (copyLength <= 0) {
                ++chunkIndex;
                continue;
            }
            output.set(chunk.subarray(startInChunk, startInChunk + copyLength), outputOffset);
            outputOffset += copyLength;
            cursor += copyLength;
            remaining -= copyLength;
            if (cursor >= chunkEnd) {
                ++chunkIndex;
            }
        }
        return remaining === 0 ? output : null;
    }

    /**
     * Serves a slice by assembling the 64 KiB pages that cover it, pulling
     * pages through the LRU cache. One retry is attempted after
     * NotReadableError recovery.
     * @param {number} offset
     * @param {number} length
     * @returns {Promise<Uint8Array|null>}
     */
    async _readSliceFromFile(offset, length) {
        if (this._readFile === null) {
            return null;
        }
        for (let attempt = 0; attempt < 2; ++attempt) {
            try {
                const output = new Uint8Array(length);
                const pageSize = READ_PAGE_SIZE_BYTES;
                const startPage = Math.floor(offset / pageSize);
                const endPage = Math.floor((offset + length - 1) / pageSize);
                let outputOffset = 0;
                for (let pageIndex = startPage; pageIndex <= endPage; ++pageIndex) {
                    const page = await this._getReadPage(pageIndex);
                    if (page === null) {
                        return null;
                    }
                    const pageStartOffset = pageIndex * pageSize;
                    const rangeStart = Math.max(offset, pageStartOffset);
                    const rangeEnd = Math.min(offset + length, pageStartOffset + page.byteLength);
                    const copyLength = rangeEnd - rangeStart;
                    if (copyLength <= 0) { continue; }
                    const pageStart = rangeStart - pageStartOffset;
                    output.set(page.subarray(pageStart, pageStart + copyLength), outputOffset);
                    outputOffset += copyLength;
                }
                return outputOffset === length ? output : null;
            } catch (error) {
                if (attempt > 0 || !this._isNotReadableFileError(error)) {
                    throw error;
                }
                const recovered = await this._recoverFromNotReadableFileError('read-slice', error);
                if (!recovered) {
                    return null;
                }
            }
        }
        return null;
    }

    /**
     * Returns the bytes of a page, from cache (refreshing its LRU position) or
     * by slicing the File snapshot. Returns null for pages past end-of-store
     * or when no snapshot is loaded.
     * @param {number} pageIndex
     * @returns {Promise<Uint8Array|null>}
     */
    async _getReadPage(pageIndex) {
        const cached = this._readPageCache.get(pageIndex);
        if (typeof cached !== 'undefined') {
            this._touchReadPage(pageIndex, cached);
            return cached;
        }
        const file = this._readFile;
        if (file === null) {
            return null;
        }
        const pageOffset = pageIndex * READ_PAGE_SIZE_BYTES;
        if (pageOffset >= this._length) {
            return null;
        }
        const pageEnd = Math.min(this._length, pageOffset + READ_PAGE_SIZE_BYTES);
        const bytes = new Uint8Array(await file.slice(pageOffset, pageEnd).arrayBuffer());
        this._setReadPage(pageIndex, bytes);
        return bytes;
    }

    /**
     * Marks a cached page most-recently-used by re-inserting it (Map preserves
     * insertion order, so the first key is the LRU victim).
     * @param {number} pageIndex
     * @param {Uint8Array} page
     */
    _touchReadPage(pageIndex, page) {
        this._readPageCache.delete(pageIndex);
        this._readPageCache.set(pageIndex, page);
    }

    /**
     * Inserts a page and evicts from the LRU end until within capacity.
     * @param {number} pageIndex
     * @param {Uint8Array} page
     */
    _setReadPage(pageIndex, page) {
        this._readPageCache.set(pageIndex, page);
        while (this._readPageCache.size > this._readPageCacheMaxPages) {
            const first = this._readPageCache.keys().next();
            if (first.done) {
                break;
            }
            this._readPageCache.delete(first.value);
        }
    }

    /**
     * Binary-searches _chunkOffsets for the chunk containing `offset`. If the
     * offset is not inside any chunk, returns the clamped insertion point;
     * callers guard with per-chunk copy-length checks.
     * @param {number} offset
     * @returns {number}
     */
    _findChunkIndex(offset) {
        let low = 0;
        let high = this._chunkOffsets.length - 1;
        while (low <= high) {
            const mid = (low + high) >>> 1;
            const start = this._chunkOffsets[mid];
            const end = start + this._chunks[mid].byteLength;
            if (offset < start) {
                high = mid - 1;
            } else if (offset >= end) {
                low = mid + 1;
            } else {
                return mid;
            }
        }
        return Math.max(0, Math.min(low, this._chunks.length - 1));
    }

    /** Drops the File snapshot, page cache, and exact-slice cache. */
    _invalidateReadState() {
        this._loadedForRead = false;
        this._readFile = null;
        this._readPageCache.clear();
        this._lastSliceCacheKey = '';
        this._lastSliceCacheValue = null;
    }

    /**
     * Matches NotReadableError by name or by message text (some environments
     * surface it only in the message).
     * @param {unknown} error
     * @returns {boolean}
     */
    _isNotReadableFileError(error) {
        const name = this._asErrorName(error);
        if (name === 'NotReadableError') {
            return true;
        }
        const text = this._asErrorText(error).toLowerCase();
        return (
            text.includes('notreadableerror') ||
            text.includes('requested file could not be read')
        );
    }

    /**
     * Extracts a string `name` property from an arbitrary thrown value, or ''.
     * @param {unknown} error
     * @returns {string}
     */
    _asErrorName(error) {
        return (typeof error === 'object' && error !== null && typeof Reflect.get(error, 'name') === 'string') ?
            /** @type {string} */ (Reflect.get(error, 'name')) :
            '';
    }

    /**
     * Formats an arbitrary thrown value as "Name: message" (or just the
     * message/stringification when no name is available).
     * @param {unknown} error
     * @returns {string}
     */
    _asErrorText(error) {
        if (error instanceof Error) {
            return `${error.name}: ${error.message}`;
        }
        const name = this._asErrorName(error);
        const message = (
            typeof error === 'object' &&
            error !== null &&
            typeof Reflect.get(error, 'message') === 'string'
        ) ?
            /** @type {string} */ (Reflect.get(error, 'message')) :
            String(error);
        return name.length > 0 ? `${name}: ${message}` : message;
    }

    /**
     * Recovery path for NotReadableError: reports diagnostics, drops read
     * state, and retries taking a File snapshot — first with the existing
     * handle, then after re-acquiring the handle from the OPFS root.
     * @param {string} phase
     * @param {unknown} error
     * @returns {Promise<boolean>} true if a usable snapshot was obtained.
     */
    async _recoverFromNotReadableFileError(phase, error) {
        reportDiagnostics('term-content-opfs-read-error', {
            phase,
            reason: this._asErrorText(error),
            action: 'retry-open-file',
        });
        this._readFile = null;
        this._readPageCache.clear();
        const refreshed = await this._refreshReadFileSnapshot({reacquireHandle: false});
        if (refreshed) {
            return true;
        }
        return await this._refreshReadFileSnapshot({reacquireHandle: true});
    }

    /**
     * Takes a fresh File snapshot, optionally re-acquiring the file handle
     * from the OPFS root first. Failures (including re-acquisition failures)
     * are swallowed and reported via the boolean result.
     * @param {{reacquireHandle: boolean}} options
     * @returns {Promise<boolean>}
     */
    async _refreshReadFileSnapshot({reacquireHandle}) {
        if (this._fileHandle === null) {
            return false;
        }
        if (reacquireHandle) {
            try {
                if (
                    typeof navigator !== 'undefined' &&
                    'storage' in navigator &&
                    'getDirectory' in navigator.storage
                ) {
                    const root = await navigator.storage.getDirectory();
                    this._fileHandle = await root.getFileHandle(FILE_NAME, {create: true});
                }
            } catch (_) {
                // NOP
            }
        }
        try {
            const file = await this._fileHandle.getFile();
            this._length = file.size;
            this._readFile = file;
            this._readPageCache.clear();
            this._loadedForRead = true;
            return true;
        } catch (_) {
            return false;
        }
    }

    /**
     * @returns {number} Page-cache capacity scaled by navigator.deviceMemory
     * (≤4 GiB → low, ≥8 GiB → high, otherwise/unknown → default).
     */
    _computeReadPageCacheMaxPages() {
        const memoryGiB = this._getDeviceMemoryGiB();
        if (memoryGiB !== null && memoryGiB <= 4) {
            return LOW_MEMORY_READ_PAGE_CACHE_MAX_PAGES;
        }
        if (memoryGiB !== null && memoryGiB >= 8) {
            return HIGH_MEMORY_READ_PAGE_CACHE_MAX_PAGES;
        }
        return DEFAULT_READ_PAGE_CACHE_MAX_PAGES;
    }

    /**
     * @returns {number} Coalesce byte target: fixed for 'raw-bytes' mode,
     * otherwise scaled by device memory like the page cache.
     */
    _computeWriteCoalesceTargetBytes() {
        if (this._importStorageMode === 'raw-bytes') {
            return RAW_BYTES_WRITE_COALESCE_TARGET_BYTES;
        }
        const memoryGiB = this._getDeviceMemoryGiB();
        if (memoryGiB !== null && memoryGiB <= 4) {
            return LOW_MEMORY_WRITE_COALESCE_TARGET_BYTES;
        }
        if (memoryGiB !== null && memoryGiB >= 8) {
            return HIGH_MEMORY_WRITE_COALESCE_TARGET_BYTES;
        }
        return DEFAULT_WRITE_COALESCE_TARGET_BYTES;
    }

    /**
     * @returns {number} Pending-bytes flush threshold: fixed for 'raw-bytes'
     * mode, otherwise scaled by device memory.
     */
    _computeWriteFlushThresholdBytes() {
        if (this._importStorageMode === 'raw-bytes') {
            return RAW_BYTES_WRITE_FLUSH_THRESHOLD_BYTES;
        }
        const memoryGiB = this._getDeviceMemoryGiB();
        if (memoryGiB !== null && memoryGiB <= 4) {
            return LOW_MEMORY_WRITE_FLUSH_THRESHOLD_BYTES;
        }
        if (memoryGiB !== null && memoryGiB >= 8) {
            return HIGH_MEMORY_WRITE_FLUSH_THRESHOLD_BYTES;
        }
        return DEFAULT_WRITE_FLUSH_THRESHOLD_BYTES;
    }

    /**
     * @returns {number} Chunk-count limit per write group; the explicit
     * override wins over the mode-derived default.
     */
    _computeWriteCoalesceMaxChunks() {
        if (this._writeCoalesceMaxChunksOverride !== null) {
            return this._writeCoalesceMaxChunksOverride;
        }
        return this._importStorageMode === 'raw-bytes' ? RAW_BYTES_WRITE_COALESCE_MAX_CHUNKS : DEFAULT_WRITE_COALESCE_MAX_CHUNKS;
    }

    /**
     * @returns {{drainCycleCount: number, writeCallCount: number, singleChunkWriteCount: number, mergedWriteCount: number, totalWriteBytes: number, mergedWriteBytes: number, maxWriteBytes: number, minWriteBytes: number, mergedGroupChunkCount: number, maxMergedGroupChunkCount: number, minMergedGroupChunkCount: number, flushDueToBytesCount: number, flushDueToChunkCount: number, flushFinalGroupCount: number, writeCoalesceTargetBytes: number, writeCoalesceMaxChunks: number}} Zeroed counters carrying the currently effective tuning values.
     */
    _createEmptyWriteDrainMetrics() {
        return {
            drainCycleCount: 0,
            writeCallCount: 0,
            singleChunkWriteCount: 0,
            mergedWriteCount: 0,
            totalWriteBytes: 0,
            mergedWriteBytes: 0,
            maxWriteBytes: 0,
            minWriteBytes: 0,
            mergedGroupChunkCount: 0,
            maxMergedGroupChunkCount: 0,
            minMergedGroupChunkCount: 0,
            flushDueToBytesCount: 0,
            flushDueToChunkCount: 0,
            flushFinalGroupCount: 0,
            writeCoalesceTargetBytes: this._writeCoalesceTargetBytes,
            writeCoalesceMaxChunks: this._writeCoalesceMaxChunks,
        };
    }

    /**
     * @returns {number|null} navigator.deviceMemory in GiB when available and
     * a finite positive number, otherwise null (e.g. Firefox, workers without
     * navigator).
     */
    _getDeviceMemoryGiB() {
        /** @type {number|null} */
        let memoryGiB = null;
        try {
            const rawValue = /** @type {unknown} */ (Reflect.get(globalThis.navigator ?? {}, 'deviceMemory'));
            if (typeof rawValue === 'number' && Number.isFinite(rawValue) && rawValue > 0) {
                memoryGiB = rawValue;
            }
        } catch (_) {
            // NOP
        }
        return memoryGiB;
    }
}
diff --git a/ext/js/dictionary/term-record-opfs-store.js b/ext/js/dictionary/term-record-opfs-store.js
new file mode 100644
index 0000000000..8dbfba6484
--- /dev/null
+++ b/ext/js/dictionary/term-record-opfs-store.js
@@ -0,0 +1,1847 @@
+/*
+ * Copyright (C) 2023-2025  Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */ + +import {parseJson} from '../core/json.js'; +import {reportDiagnostics} from '../core/diagnostics-reporter.js'; +import {safePerformance} from '../core/safe-performance.js'; +import {RAW_TERM_CONTENT_DICT_NAME, RAW_TERM_CONTENT_SHARED_GLOSSARY_DICT_NAME} from './raw-term-content.js'; +import {encodeTermRecordsWithWasm} from './term-record-wasm-encoder.js'; + +const LEGACY_FILE_NAME = 'yomitan-term-records.ndjson'; +const SHARD_DIRECTORY_NAME = 'yomitan-term-records'; +const SHARD_FILE_PREFIX = 'dict-'; +const SHARD_FILE_SUFFIX = '.mbtr'; +const BINARY_MAGIC_TEXT = 'YMTRREC9'; +const PREVIOUS_BINARY_MAGIC_TEXT = 'YMTRREC8'; +const PREVIOUS_PREVIOUS_BINARY_MAGIC_TEXT = 'YMTRREC5'; +const PREVIOUS_PREVIOUS_PREVIOUS_BINARY_MAGIC_TEXT = 'YMTRREC4'; +const PREVIOUS_PREVIOUS_PREVIOUS_PREVIOUS_BINARY_MAGIC_TEXT = 'YMTRREC3'; +const PREVIOUS_PREVIOUS_PREVIOUS_PREVIOUS_PREVIOUS_BINARY_MAGIC_TEXT = 'YMTRREC2'; +const LEGACY_BINARY_MAGIC_TEXT = 'YMTRREC1'; +const BINARY_MAGIC_BYTES = 8; +const CHUNK_HEADER_BYTES = 8; +const RECORD_HEADER_BYTES = 20; +const PREVIOUS_RECORD_HEADER_BYTES = 22; +const PREVIOUS_PREVIOUS_RECORD_HEADER_BYTES = 32; +const PREVIOUS_PREVIOUS_PREVIOUS_RECORD_HEADER_BYTES = 40; +const LEGACY_RECORD_HEADER_BYTES = 44; +const U32_NULL = 0xffffffff; +const U16_NULL = 0xffff; +const DEFAULT_FLUSH_THRESHOLD_BYTES = 32 * 1024 * 1024; +const LOW_MEMORY_FLUSH_THRESHOLD_BYTES = 16 * 1024 * 1024; +const HIGH_MEMORY_FLUSH_THRESHOLD_BYTES = 64 * 1024 * 1024; +const DEFAULT_WRITE_COALESCE_TARGET_BYTES = 4 * 1024 * 1024; +const LOW_MEMORY_WRITE_COALESCE_TARGET_BYTES = 1024 * 1024; +const HIGH_MEMORY_WRITE_COALESCE_TARGET_BYTES = 16 * 1024 * 1024; +const WRITE_COALESCE_MAX_CHUNKS = 512; +const ENTRY_CONTENT_DICT_NAME_CODE_RAW = 0; +const ENTRY_CONTENT_DICT_NAME_CODE_RAW_V2 = 1; +const ENTRY_CONTENT_DICT_NAME_CODE_RAW_V3 = 2; +const ENTRY_CONTENT_DICT_NAME_CODE_JMDICT = 3; +const ENTRY_CONTENT_DICT_NAME_CODE_CUSTOM = 0xff; +const ENTRY_CONTENT_LENGTH_U16_NULL = 
0xffff; +const ENTRY_CONTENT_LENGTH_EXTENDED_U16 = 0xfffe; +const ENTRY_CONTENT_DICT_NAME_FLAG_READING_EQUALS_EXPRESSION = 0x8000; +const ENTRY_CONTENT_DICT_NAME_FLAG_READING_REVERSE_EQUALS_EXPRESSION_REVERSE = 0x40000000; +const ENTRY_CONTENT_DICT_NAME_FLAGS_MASK = 0x8000; +const ENTRY_CONTENT_DICT_NAME_VALUE_MASK = 0x7fff; + +/** + * @typedef {object} TermRecord + * @property {number} id + * @property {string} dictionary + * @property {string} expression + * @property {string} reading + * @property {string|null} expressionReverse + * @property {string|null} readingReverse + * @property {number} entryContentOffset + * @property {number} entryContentLength + * @property {string} entryContentDictName + * @property {number} score + * @property {number|null} sequence + */ + +/** + * @typedef {object} TermRecordShardState + * @property {string} fileName + * @property {FileSystemFileHandle} fileHandle + * @property {FileSystemWritableFileStream|null} writable + * @property {number} fileLength + * @property {number} pendingWriteBytes + * @property {Uint8Array[]} pendingWriteChunks + */ + +export class TermRecordOpfsStore { + constructor() { + /** @type {FileSystemDirectoryHandle|null} */ + this._rootDirectoryHandle = null; + /** @type {FileSystemDirectoryHandle|null} */ + this._recordsDirectoryHandle = null; + /** @type {Map} */ + this._shardStateByFileName = new Map(); + /** @type {number} */ + this._flushThresholdBytes = this._computeFlushThresholdBytes(); + /** @type {boolean} */ + this._importSessionActive = false; + /** @type {Map} */ + this._recordsById = new Map(); + /** @type {number} */ + this._nextId = 1; + /** @type {Map, reading: Map, expressionReverse: Map, readingReverse: Map, pair: Map, sequence: Map}>} */ + this._indexByDictionary = new Map(); + /** @type {boolean} */ + this._deferIndexBuild = false; + /** @type {boolean} */ + this._indexDirty = false; + /** @type {TextEncoder} */ + this._textEncoder = new TextEncoder(); + /** @type {TextDecoder} */ + 
this._textDecoder = new TextDecoder(); + /** @type {boolean} */ + this._wasmEncoderUnavailable = false; + /** @type {string[]} */ + this._invalidShardFileNames = []; + /** @type {number} */ + this._writeCoalesceTargetBytes = this._computeWriteCoalesceTargetBytes(); + } + + /** + * @returns {Promise} + */ + async prepare() { + await this._closeAllWritables(); + this._recordsById.clear(); + this._indexByDictionary.clear(); + this._nextId = 1; + this._deferIndexBuild = false; + this._indexDirty = false; + this._rootDirectoryHandle = null; + this._recordsDirectoryHandle = null; + this._shardStateByFileName.clear(); + this._invalidShardFileNames = []; + if (typeof navigator === 'undefined' || !('storage' in navigator) || !('getDirectory' in navigator.storage)) { + return; + } + const rootDirectoryHandle = await navigator.storage.getDirectory(); + this._rootDirectoryHandle = rootDirectoryHandle; + this._recordsDirectoryHandle = await rootDirectoryHandle.getDirectoryHandle(SHARD_DIRECTORY_NAME, {create: true}); + + const shardFileCount = await this._loadShardFiles(); + await (shardFileCount === 0 ? 
this._migrateLegacyMonolithicIfPresent() : this._deleteLegacyMonolithicIfPresent()); + await this.verifyIntegrity(); + } + + /** + * @returns {Promise} + */ + async beginImportSession() { + if (this._importSessionActive) { + return; + } + this._importSessionActive = true; + this._deferIndexBuild = true; + this._indexDirty = true; + this._indexByDictionary.clear(); + for (const state of this._shardStateByFileName.values()) { + state.pendingWriteBytes = 0; + state.pendingWriteChunks = []; + } + } + + /** + * @returns {Promise} + */ + async endImportSession() { + if (!this._importSessionActive && !this._hasPendingShardWrites()) { + return; + } + this._importSessionActive = false; + await this._flushPendingWrites(); + await this._closeAllWritables(); + this._deferIndexBuild = false; + if (this._indexDirty) { + this._indexByDictionary.clear(); + this._indexDirty = false; + } + } + + /** + * @returns {Promise} + */ + async reset() { + await this._closeAllWritables(); + this._recordsById.clear(); + this._indexByDictionary.clear(); + this._nextId = 1; + this._deferIndexBuild = false; + this._indexDirty = false; + this._shardStateByFileName.clear(); + this._invalidShardFileNames = []; + if (this._recordsDirectoryHandle === null) { + return; + } + const shardFileNames = await this._listShardFileNames(); + for (const fileName of shardFileNames) { + try { + await this._recordsDirectoryHandle.removeEntry(fileName); + } catch (_) { + // NOP + } + } + await this._deleteLegacyMonolithicIfPresent(); + } + + /** + * @returns {number} + */ + get size() { + return this._recordsById.size; + } + + /** + * @returns {boolean} + */ + isEmpty() { + return this._recordsById.size === 0; + } + + /** + * @param {{dictionary: string, expression: string, reading: string, expressionReverse: string|null, readingReverse: string|null, entryContentOffset: number, entryContentLength: number, entryContentDictName: string|null, score: number, sequence: number|null}[]} records + * @returns {Promise} + */ 
async appendBatch(records) {
    if (records.length === 0) { return; }
    // Group new records per dictionary so each shard is appended to once.
    /** @type {Map} */
    const recordsByDictionary = new Map();
    for (const row of records) {
        const id = this._nextId++;
        const record = {
            id,
            dictionary: row.dictionary,
            expression: row.expression,
            reading: row.reading,
            expressionReverse: row.expressionReverse,
            readingReverse: row.readingReverse,
            entryContentOffset: row.entryContentOffset,
            entryContentLength: row.entryContentLength,
            entryContentDictName: row.entryContentDictName ?? 'raw',
            score: row.score,
            sequence: row.sequence,
        };
        this._recordsById.set(id, record);
        const dictionaryRecords = recordsByDictionary.get(record.dictionary);
        if (typeof dictionaryRecords === 'undefined') {
            recordsByDictionary.set(record.dictionary, [record]);
        } else {
            dictionaryRecords.push(record);
        }
        if (!this._deferIndexBuild) {
            // Keep an already-materialized dictionary index up to date;
            // absent indexes are built lazily on first lookup.
            const existingIndex = this._indexByDictionary.get(record.dictionary);
            if (typeof existingIndex !== 'undefined') {
                this._addRecordToDictionaryIndex(existingIndex, record);
            }
        }
    }
    if (this._deferIndexBuild) {
        this._indexDirty = true;
    }
    // Persist each dictionary's records as one encoded chunk on its shard.
    for (const [dictionaryName, dictionaryRecords] of recordsByDictionary) {
        const state = await this._getOrCreateShardState(dictionaryName);
        if (state === null) { continue; }
        await this._appendEncodedChunk(state, await this._encodeRecords(dictionaryRecords), dictionaryRecords[0]?.id ?? 0, dictionaryRecords.length);
    }
}

/**
 * Fast-path append for SQL row arrays from dictionary-database bulk term insert.
 * @param {unknown[][]} rows
 * @param {number} start
 * @param {number} count
 * @returns {Promise}
 */
async appendBatchFromTermRows(rows, start, count) {
    if (count <= 0) { return; }
    // Single-dictionary batches are the common case; avoid allocating a Map
    // until a second dictionary name is actually observed.
    /** @type {Map|null} */
    let recordsByDictionary = null;
    /** @type {TermRecord[]} */
    const singleDictionaryRecords = [];
    let singleDictionaryName = '';
    for (let i = start, ii = start + count; i < ii; ++i) {
        // Positional SQL row layout; indexes 5, 9-11 and 13 are unused here.
        const row = /** @type {[string, string, string, (string|null), (string|null), unknown, number, number, (string|null), unknown, unknown, unknown, number, unknown, (number|null)]} */ (rows[i]);
        const id = this._nextId++;
        const dictionary = row[0];
        /** @type {TermRecord} */
        const record = {
            id,
            dictionary,
            expression: row[1],
            reading: row[2],
            expressionReverse: row[3],
            readingReverse: row[4],
            entryContentOffset: row[6],
            entryContentLength: row[7],
            entryContentDictName: row[8] ?? 'raw',
            score: row[12],
            sequence: row[14],
        };
        this._recordsById.set(id, record);
        if (i === start) {
            singleDictionaryName = dictionary;
        }
        if (recordsByDictionary === null) {
            if (dictionary === singleDictionaryName) {
                singleDictionaryRecords.push(record);
            } else {
                // Second dictionary encountered: promote to the Map form.
                recordsByDictionary = new Map();
                recordsByDictionary.set(singleDictionaryName, singleDictionaryRecords);
                recordsByDictionary.set(dictionary, [record]);
            }
        } else {
            let dictionaryRecords = recordsByDictionary.get(dictionary);
            if (typeof dictionaryRecords === 'undefined') {
                dictionaryRecords = [];
                recordsByDictionary.set(dictionary, dictionaryRecords);
            }
            dictionaryRecords.push(record);
        }
        if (!this._deferIndexBuild) {
            const existingIndex = this._indexByDictionary.get(dictionary);
            if (typeof existingIndex !== 'undefined') {
                this._addRecordToDictionaryIndex(existingIndex, record);
            }
        }
    }
    if (this._deferIndexBuild) {
        this._indexDirty = true;
    }
    if (recordsByDictionary === null) {
        const state = await this._getOrCreateShardState(singleDictionaryName);
        if (state !== null) {
            await this._appendEncodedChunk(state, await this._encodeRecords(singleDictionaryRecords), singleDictionaryRecords[0]?.id ?? 0, singleDictionaryRecords.length);
        }
        return;
    }
    for (const [dictionaryName, dictionaryRecords] of recordsByDictionary) {
        const state = await this._getOrCreateShardState(dictionaryName);
        if (state === null) { continue; }
        await this._appendEncodedChunk(state, await this._encodeRecords(dictionaryRecords), dictionaryRecords[0]?.id ?? 0, dictionaryRecords.length);
    }
}

/**
 * Fast-path append for importer DatabaseTermEntry arrays paired with resolved content refs.
 * Indexed by absolute row index i (start..start+count) into the content arrays.
 * @param {unknown[]} rows
 * @param {number} start
 * @param {number} count
 * @param {number[]} contentOffsets
 * @param {number[]} contentLengths
 * @param {(string|null)[]} contentDictNames
 * @returns {Promise<{buildRecordsMs: number, encodeMs: number, appendWriteMs: number}>} Timing metrics for the import profiler.
 */
async appendBatchFromResolvedImportTermEntries(rows, start, count, contentOffsets, contentLengths, contentDictNames) {
    if (count <= 0) { return {buildRecordsMs: 0, encodeMs: 0, appendWriteMs: 0}; }
    // Content ref arrays are indexed by absolute row index, so they must
    // extend at least to start + count.
    if (contentOffsets.length < (start + count) || contentLengths.length < (start + count) || contentDictNames.length < (start + count)) {
        throw new Error('appendBatchFromResolvedImportTermEntries content refs length is smaller than row count');
    }
    const tBuildStart = safePerformance.now();
    let buildRecordsMs = 0;
    let encodeMs = 0;
    let appendWriteMs = 0;
    /** @type {Map|null} */
    let recordsByDictionary = null;
    // Pre-sized for the single-dictionary fast path; only the first
    // singleDictionaryRecordCount slots are populated.
    /** @type {TermRecord[]} */
    const singleDictionaryRecords = new Array(count);
    let singleDictionaryRecordCount = 0;
    let singleDictionaryName = '';
    for (let i = start, ii = start + count; i < ii; ++i) {
        const row = /** @type {{dictionary: string, expression: string, reading: string, expressionReverse?: string, readingReverse?: string, score: number, sequence?: number}} */ (rows[i]);
        const id = this._nextId++;
        const dictionary = row.dictionary;
        /** @type {TermRecord} */
        const record = {
            id,
            dictionary,
            expression: row.expression,
            reading: row.reading,
            expressionReverse: row.expressionReverse ?? null,
            readingReverse: row.readingReverse ?? null,
            entryContentOffset: contentOffsets[i],
            entryContentLength: contentLengths[i],
            entryContentDictName: contentDictNames[i] ?? 'raw',
            score: row.score,
            sequence: typeof row.sequence === 'number' ? row.sequence : null,
        };
        this._recordsById.set(id, record);
        if (i === start) {
            singleDictionaryName = dictionary;
        }
        if (recordsByDictionary === null) {
            if (dictionary === singleDictionaryName) {
                singleDictionaryRecords[singleDictionaryRecordCount++] = record;
            } else {
                // Promote to Map form; slice trims the pre-sized array to
                // only the populated slots.
                recordsByDictionary = new Map();
                recordsByDictionary.set(singleDictionaryName, singleDictionaryRecords.slice(0, singleDictionaryRecordCount));
                recordsByDictionary.set(dictionary, [record]);
            }
        } else {
            let dictionaryRecords = recordsByDictionary.get(dictionary);
            if (typeof dictionaryRecords === 'undefined') {
                dictionaryRecords = [];
                recordsByDictionary.set(dictionary, dictionaryRecords);
            }
            dictionaryRecords.push(record);
        }
        if (!this._deferIndexBuild) {
            const existingIndex = this._indexByDictionary.get(dictionary);
            if (typeof existingIndex !== 'undefined') {
                this._addRecordToDictionaryIndex(existingIndex, record);
            }
        }
    }
    if (this._deferIndexBuild) {
        this._indexDirty = true;
    }
    buildRecordsMs = safePerformance.now() - tBuildStart;
    if (recordsByDictionary === null) {
        const state = await this._getOrCreateShardState(singleDictionaryName);
        if (state !== null) {
            const metrics = await this._encodeAndAppendChunkForState(state, singleDictionaryRecords);
            encodeMs += metrics.encodeMs;
            appendWriteMs += metrics.appendWriteMs;
        }
        return {buildRecordsMs, encodeMs, appendWriteMs};
    }
    for (const [dictionaryName, dictionaryRecords] of recordsByDictionary) {
        const state = await this._getOrCreateShardState(dictionaryName);
        if (state === null) { continue; }
        const metrics = await this._encodeAndAppendChunkForState(state, dictionaryRecords);
        encodeMs += metrics.encodeMs;
        appendWriteMs += metrics.appendWriteMs;
    }
    return {buildRecordsMs, encodeMs, appendWriteMs};
}

/**
 * Fast-path append for importer DatabaseTermEntry arrays paired with content spans.
 * Spans are indexed relative to the batch (0..count), unlike the resolved-refs variant.
 * @param {unknown[]} rows
 * @param {number} start
 * @param {number} count
 * @param {{offset: number, length: number}[]} spans
 * @returns {Promise}
 */
async appendBatchFromImportTermEntries(rows, start, count, spans) {
    if (count <= 0) { return; }
    if (spans.length < count) {
        throw new Error('appendBatchFromImportTermEntries spans length is smaller than row count');
    }
    /** @type {Map|null} */
    let recordsByDictionary = null;
    /** @type {TermRecord[]} */
    const singleDictionaryRecords = [];
    let singleDictionaryName = '';
    for (let i = 0; i < count; ++i) {
        const row = /** @type {{dictionary: string, expression: string, reading: string, expressionReverse?: string, readingReverse?: string, score: number, sequence?: number}} */ (rows[start + i]);
        const span = spans[i];
        const id = this._nextId++;
        const dictionary = row.dictionary;
        /** @type {TermRecord} */
        const record = {
            id,
            dictionary,
            expression: row.expression,
            reading: row.reading,
            expressionReverse: row.expressionReverse ?? null,
            readingReverse: row.readingReverse ?? null,
            entryContentOffset: span.offset,
            entryContentLength: span.length,
            entryContentDictName: 'raw',
            score: row.score,
            sequence: typeof row.sequence === 'number' ? row.sequence : null,
        };
        this._recordsById.set(id, record);
        if (i === 0) {
            singleDictionaryName = dictionary;
        }
        if (recordsByDictionary === null) {
            if (dictionary === singleDictionaryName) {
                singleDictionaryRecords.push(record);
            } else {
                recordsByDictionary = new Map();
                recordsByDictionary.set(singleDictionaryName, singleDictionaryRecords);
                recordsByDictionary.set(dictionary, [record]);
            }
        } else {
            let dictionaryRecords = recordsByDictionary.get(dictionary);
            if (typeof dictionaryRecords === 'undefined') {
                dictionaryRecords = [];
                recordsByDictionary.set(dictionary, dictionaryRecords);
            }
            dictionaryRecords.push(record);
        }
        if (!this._deferIndexBuild) {
            const existingIndex = this._indexByDictionary.get(dictionary);
            if (typeof existingIndex !== 'undefined') {
                this._addRecordToDictionaryIndex(existingIndex, record);
            }
        }
    }
    if (this._deferIndexBuild) {
        this._indexDirty = true;
    }
    if (recordsByDictionary === null) {
        const state = await this._getOrCreateShardState(singleDictionaryName);
        if (state !== null) {
            await this._encodeAndAppendChunkForState(state, singleDictionaryRecords);
        }
        return;
    }
    for (const [dictionaryName, dictionaryRecords] of recordsByDictionary) {
        const state = await this._getOrCreateShardState(dictionaryName);
        if (state === null) { continue; }
        await this._encodeAndAppendChunkForState(state, dictionaryRecords);
    }
}

/**
 * Fast-path append for importer DatabaseTermEntry arrays paired with raw content offset/length arrays.
 * Offsets/lengths are indexed relative to the batch (0..count); a single
 * content dictionary name applies to every row.
 * @param {unknown[]} rows
 * @param {number} start
 * @param {number} count
 * @param {number[]} contentOffsets
 * @param {number[]} contentLengths
 * @param {string|null} [contentDictName='raw']
 * @returns {Promise<{buildRecordsMs: number, encodeMs: number, appendWriteMs: number}>} Timing metrics for the import profiler.
 */
async appendBatchFromImportTermEntriesResolvedContent(rows, start, count, contentOffsets, contentLengths, contentDictName = 'raw') {
    if (count <= 0) { return {buildRecordsMs: 0, encodeMs: 0, appendWriteMs: 0}; }
    if (contentOffsets.length < count || contentLengths.length < count) {
        throw new Error('appendBatchFromImportTermEntriesResolvedContent content arrays are smaller than row count');
    }
    const tBuildStart = safePerformance.now();
    let buildRecordsMs = 0;
    let encodeMs = 0;
    let appendWriteMs = 0;
    /** @type {Map|null} */
    let recordsByDictionary = null;
    // Pre-sized for the single-dictionary fast path; only the first
    // singleDictionaryRecordCount slots are populated.
    /** @type {TermRecord[]} */
    const singleDictionaryRecords = new Array(count);
    let singleDictionaryRecordCount = 0;
    let firstDictionaryName = '';
    for (let i = 0; i < count; ++i) {
        const row = /** @type {{dictionary: string, expression: string, reading: string, expressionReverse?: string, readingReverse?: string, score: number, sequence?: number}} */ (rows[start + i]);
        const id = this._nextId++;
        const dictionary = row.dictionary;
        /** @type {TermRecord} */
        const record = {
            id,
            dictionary,
            expression: row.expression,
            reading: row.reading,
            expressionReverse: row.expressionReverse ?? null,
            readingReverse: row.readingReverse ?? null,
            entryContentOffset: contentOffsets[i],
            entryContentLength: contentLengths[i],
            entryContentDictName: contentDictName ?? 'raw',
            score: row.score,
            sequence: typeof row.sequence === 'number' ? row.sequence : null,
        };
        this._recordsById.set(id, record);
        if (i === 0) {
            firstDictionaryName = dictionary;
        }
        if (recordsByDictionary === null) {
            if (dictionary === firstDictionaryName) {
                singleDictionaryRecords[singleDictionaryRecordCount++] = record;
            } else {
                recordsByDictionary = new Map();
                recordsByDictionary.set(firstDictionaryName, singleDictionaryRecords.slice(0, singleDictionaryRecordCount));
                recordsByDictionary.set(dictionary, [record]);
            }
        } else {
            let dictionaryRecords = recordsByDictionary.get(dictionary);
            if (typeof dictionaryRecords === 'undefined') {
                dictionaryRecords = [];
                recordsByDictionary.set(dictionary, dictionaryRecords);
            }
            dictionaryRecords.push(record);
        }
        if (!this._deferIndexBuild) {
            const existingIndex = this._indexByDictionary.get(dictionary);
            if (typeof existingIndex !== 'undefined') {
                this._addRecordToDictionaryIndex(existingIndex, record);
            }
        }
    }
    if (this._deferIndexBuild) {
        this._indexDirty = true;
    }
    buildRecordsMs = safePerformance.now() - tBuildStart;
    if (recordsByDictionary === null) {
        const state = await this._getOrCreateShardState(firstDictionaryName);
        if (state !== null) {
            const metrics = await this._encodeAndAppendChunkForState(state, singleDictionaryRecords);
            encodeMs += metrics.encodeMs;
            appendWriteMs += metrics.appendWriteMs;
        }
        return {buildRecordsMs, encodeMs, appendWriteMs};
    }
    for (const [dictionaryName, dictionaryRecords] of recordsByDictionary) {
        const state = await this._getOrCreateShardState(dictionaryName);
        if (state === null) { continue; }
        const metrics = await this._encodeAndAppendChunkForState(state, dictionaryRecords);
        encodeMs += metrics.encodeMs;
        appendWriteMs += metrics.appendWriteMs;
    }
    return {buildRecordsMs, encodeMs, appendWriteMs};
}

/**
 * Encodes records and appends the resulting chunk to a shard, timing each phase.
 * @param {TermRecordShardState} state
 * @param {TermRecord[]} records
 * @returns {Promise<{encodeMs: number, appendWriteMs: number}>}
 */
async
_encodeAndAppendChunkForState(state, records) {
    const tEncodeStart = safePerformance.now();
    const chunk = await this._encodeRecords(records);
    const encodeMs = safePerformance.now() - tEncodeStart;
    const tAppendStart = safePerformance.now();
    await this._appendEncodedChunk(state, chunk, records[0]?.id ?? 0, records.length);
    const appendWriteMs = safePerformance.now() - tAppendStart;
    return {encodeMs, appendWriteMs};
}

/**
 * Removes all in-memory records and the shard file for one dictionary.
 * @param {string} dictionaryName
 * @returns {Promise} Resolves with the number of records removed.
 */
async deleteByDictionary(dictionaryName) {
    let deletedCount = 0;
    // Snapshot the keys so deletion does not mutate the live iterator.
    const ids = [...this._recordsById.keys()];
    for (const id of ids) {
        const record = this._recordsById.get(id);
        if (typeof record === 'undefined' || record.dictionary !== dictionaryName) { continue; }
        this._recordsById.delete(id);
        ++deletedCount;
    }
    this._indexByDictionary.delete(dictionaryName);
    await this._deleteShardByDictionary(dictionaryName);
    return deletedCount;
}

/**
 * Looks up records by id; ids with no record are silently omitted.
 * @param {Iterable} ids
 * @returns {Map}
 */
getByIds(ids) {
    /** @type {Map} */
    const result = new Map();
    for (const id of ids) {
        const record = this._recordsById.get(id);
        if (typeof record !== 'undefined') {
            result.set(id, record);
        }
    }
    return result;
}

/**
 * @returns {number[]} All record ids, sorted ascending.
 */
getAllIds() {
    return [...this._recordsById.keys()].sort((a, b) => a - b);
}

/**
 * Returns the lookup index for one dictionary, building it lazily from
 * in-memory records. An existing-but-empty index while records exist is
 * treated as stale and rebuilt (self-healing for deferred index builds).
 * @param {string} dictionaryName
 * @returns {{expression: Map, reading: Map, expressionReverse: Map, readingReverse: Map, pair: Map, sequence: Map}}
 */
getDictionaryIndex(dictionaryName) {
    this._ensureIndexesReady();
    const existing = this._indexByDictionary.get(dictionaryName);
    if (typeof existing !== 'undefined') {
        if (
            existing.expression.size === 0 &&
            existing.reading.size === 0 &&
            this._hasRecordsForDictionary(dictionaryName)
        ) {
            // Stale empty index: discard and rebuild below.
            this._indexByDictionary.delete(dictionaryName);
        } else {
            return existing;
        }
    }
    const created = {
        expression: new Map(),
        reading: new Map(),
        expressionReverse: new Map(),
        readingReverse: new Map(),
        pair: new Map(),
        sequence: new Map(),
    };
    for (const record of this._recordsById.values()) {
        if (record.dictionary !== dictionaryName) { continue; }
        this._addRecordToDictionaryIndex(created, record);
    }
    if (
        created.expression.size === 0 &&
        created.reading.size === 0 &&
        this._hasRecordsForDictionary(dictionaryName)
    ) {
        // Fresh build also came out empty despite records existing;
        // fall back to a full rebuild of every dictionary index.
        this._rebuildIndexesFromRecords();
        const rebuilt = this._indexByDictionary.get(dictionaryName);
        if (typeof rebuilt !== 'undefined') {
            return rebuilt;
        }
    }
    this._indexByDictionary.set(dictionaryName, created);
    return created;
}

/**
 * @param {string} dictionaryName
 * @returns {boolean} True when at least one record belongs to the dictionary (linear scan).
 */
_hasRecordsForDictionary(dictionaryName) {
    for (const record of this._recordsById.values()) {
        if (record.dictionary === dictionaryName) {
            return true;
        }
    }
    return false;
}

/**
 * @param {number} id
 * @returns {TermRecord|undefined}
 */
getById(id) {
    return this._recordsById.get(id);
}

/**
 * Reconciles shard files on disk with in-memory records: deletes orphan
 * shards, rewrites all shards from memory if any expected shard is missing,
 * and reports a summary via diagnostics.
 * @param {string[]|null} [expectedDictionaryNames] Additional dictionaries whose shards must exist.
 * @returns {Promise<{
 * expectedShardCount: number,
 * actualShardCount: number,
 * missingShardCount: number,
 * missingShardFileNames: string[],
 * missingDictionaryNames: string[],
 * orphanShardCount: number,
 * orphanShardFileNames: string[],
 * orphanDictionaryNames: string[],
 * removedOrphanShardCount: number,
 * invalidShardPayloadCount: number,
 * invalidShardFileNames: string[],
 * rewroteAllShardsFromMemory: boolean
 * }>}
 */
async verifyIntegrity(expectedDictionaryNames = null) {
    // Shards implied by in-memory records, plus any explicitly expected ones.
    /** @type {Set} */
    const expectedFileNames = new Set();
    /** @type {Set} */
    const expectedFileNamesFromRecords = new Set();
    for (const record of this._recordsById.values()) {
        const fileName = this._getShardFileName(record.dictionary);
        expectedFileNames.add(fileName);
        expectedFileNamesFromRecords.add(fileName);
    }
    if (Array.isArray(expectedDictionaryNames)) {
        for (const dictionaryName of expectedDictionaryNames) {
            if (typeof dictionaryName !== 'string' || dictionaryName.length === 0) { continue; }
            expectedFileNames.add(this._getShardFileName(dictionaryName));
        }
    }

    /** @type {string[]} */
    const missingShardFileNames = [];
    /** @type {string[]} */
    const orphanShardFileNames = [];
    for (const fileName of expectedFileNames) {
        if (!this._shardStateByFileName.has(fileName)) {
            missingShardFileNames.push(fileName);
        }
    }
    for (const fileName of this._shardStateByFileName.keys()) {
        if (!expectedFileNames.has(fileName)) {
            orphanShardFileNames.push(fileName);
        }
    }

    // Orphan shards (no backing records) are removed from disk best-effort.
    let removedOrphanShardCount = 0;
    for (const fileName of orphanShardFileNames) {
        if (this._recordsDirectoryHandle !== null) {
            try {
                await this._recordsDirectoryHandle.removeEntry(fileName);
                ++removedOrphanShardCount;
            } catch (_) {
                // NOP
            }
        }
        this._shardStateByFileName.delete(fileName);
    }

    // A missing shard for a dictionary that has in-memory records means the
    // on-disk state is incomplete; rewrite everything from memory.
    let rewroteAllShardsFromMemory = false;
    let shouldRewriteFromMemory = false;
    for (const fileName of missingShardFileNames) {
        if (expectedFileNamesFromRecords.has(fileName)) {
            shouldRewriteFromMemory = true;
            break;
        }
    }
    if (shouldRewriteFromMemory) {
        await this._rewriteAllShardsFromMemory();
        rewroteAllShardsFromMemory = true;
    }

    // Non-decodable file names are filtered out of the name summaries.
    const missingDictionaryNames = missingShardFileNames
        .map((fileName) => this._decodeDictionaryNameFromShardFileName(fileName))
        .filter((value) => typeof value === 'string');
    const orphanDictionaryNames = orphanShardFileNames
        .map((fileName) => this._decodeDictionaryNameFromShardFileName(fileName))
        .filter((value) => typeof value === 'string');

    const summary = {
        expectedShardCount: expectedFileNames.size,
        actualShardCount: this._shardStateByFileName.size,
        missingShardCount: missingShardFileNames.length,
        missingShardFileNames: [...missingShardFileNames].sort(),
        missingDictionaryNames: [...new Set(missingDictionaryNames)].sort(),
        orphanShardCount:
orphanShardFileNames.length,
        orphanShardFileNames: [...orphanShardFileNames].sort(),
        orphanDictionaryNames: [...new Set(orphanDictionaryNames)].sort(),
        removedOrphanShardCount,
        invalidShardPayloadCount: this._invalidShardFileNames.length,
        invalidShardFileNames: [...this._invalidShardFileNames].sort(),
        rewroteAllShardsFromMemory,
    };
    reportDiagnostics('term-record-shard-integrity-summary', summary);
    return summary;
}

/**
 * Checks whether a payload starts with any known binary-format magic.
 * NOTE(review): assumes every magic constant is exactly BINARY_MAGIC_BYTES
 * long — confirm against the constant definitions.
 * @param {Uint8Array} content
 * @returns {boolean}
 */
_isBinaryFormat(content) {
    if (content.byteLength < BINARY_MAGIC_BYTES) {
        return false;
    }
    const magic = this._textDecoder.decode(content.subarray(0, BINARY_MAGIC_BYTES));
    return (
        magic === BINARY_MAGIC_TEXT ||
        magic === PREVIOUS_BINARY_MAGIC_TEXT ||
        magic === PREVIOUS_PREVIOUS_BINARY_MAGIC_TEXT ||
        magic === PREVIOUS_PREVIOUS_PREVIOUS_BINARY_MAGIC_TEXT ||
        magic === PREVIOUS_PREVIOUS_PREVIOUS_PREVIOUS_BINARY_MAGIC_TEXT ||
        magic === PREVIOUS_PREVIOUS_PREVIOUS_PREVIOUS_PREVIOUS_BINARY_MAGIC_TEXT ||
        magic === LEGACY_BINARY_MAGIC_TEXT
    );
}

/**
 * Decodes a binary shard payload into in-memory records, supporting the
 * current format plus six historical layouts selected by the magic header.
 * Truncated or malformed payloads terminate the parse early (return) rather
 * than throwing; successfully parsed records advance this._nextId.
 * All multi-byte fields are little-endian.
 * @param {Uint8Array} content
 * @param {string|null} shardDictionaryName Dictionary name for per-dictionary shards (non-legacy formats store no name per record).
 */
_loadBinary(content, shardDictionaryName = null) {
    const view = new DataView(content.buffer, content.byteOffset, content.byteLength);
    const magic = this._textDecoder.decode(content.subarray(0, BINARY_MAGIC_BYTES));
    const isLegacy = magic === LEGACY_BINARY_MAGIC_TEXT;
    const isCurrent = magic === BINARY_MAGIC_TEXT;
    const isPrevious = magic === PREVIOUS_BINARY_MAGIC_TEXT;
    const isPreviousPrevious = magic === PREVIOUS_PREVIOUS_BINARY_MAGIC_TEXT;
    const isPreviousPreviousPrevious = magic === PREVIOUS_PREVIOUS_PREVIOUS_BINARY_MAGIC_TEXT;
    const isPreviousPreviousPreviousPrevious = magic === PREVIOUS_PREVIOUS_PREVIOUS_PREVIOUS_BINARY_MAGIC_TEXT;
    const isPreviousPreviousPreviousPreviousPrevious = magic === PREVIOUS_PREVIOUS_PREVIOUS_PREVIOUS_PREVIOUS_BINARY_MAGIC_TEXT;
    let recordHeaderBytes;
    if (isLegacy) {
        recordHeaderBytes = LEGACY_RECORD_HEADER_BYTES;
    } else if (isCurrent) {
        recordHeaderBytes = RECORD_HEADER_BYTES;
    } else if (isPrevious) {
        recordHeaderBytes = PREVIOUS_RECORD_HEADER_BYTES;
    } else if (isPreviousPrevious) {
        recordHeaderBytes = PREVIOUS_PREVIOUS_RECORD_HEADER_BYTES;
    } else {
        recordHeaderBytes = PREVIOUS_PREVIOUS_PREVIOUS_RECORD_HEADER_BYTES;
    }
    let cursor = BINARY_MAGIC_BYTES;
    while (true) {
        let chunkBaseId = 0;
        let chunkCount = 0;
        if (isCurrent) {
            // Current format groups records into chunks; ids are implicit
            // (chunkBaseId + index) rather than stored per record.
            if ((cursor + CHUNK_HEADER_BYTES) > content.byteLength) { break; }
            chunkBaseId = view.getUint32(cursor, true); cursor += 4;
            chunkCount = view.getUint32(cursor, true); cursor += 4;
            if (chunkBaseId <= 0 || chunkCount === 0) { break; }
        } else {
            // Older formats are a flat record stream; treat each record as a chunk of one.
            if ((cursor + recordHeaderBytes) > content.byteLength) { break; }
            chunkCount = 1;
        }
        for (let chunkIndex = 0; chunkIndex < chunkCount; ++chunkIndex) {
            if ((cursor + recordHeaderBytes) > content.byteLength) { return; }
            const id = isCurrent ? (chunkBaseId + chunkIndex) : view.getUint32(cursor, true);
            if (!isCurrent) { cursor += 4; }
            // Only the legacy format stores a per-record dictionary name.
            const dictionaryLength = isLegacy ? view.getUint32(cursor, true) : 0; cursor += isLegacy ? 4 : 0;
            // Newer formats shrank the length fields from u32 to u16.
            const expressionLength = (isCurrent || isPrevious) ? view.getUint16(cursor, true) : view.getUint32(cursor, true); cursor += (isCurrent || isPrevious) ? 2 : 4;
            const readingLength = (isCurrent || isPrevious) ? view.getUint16(cursor, true) : view.getUint32(cursor, true); cursor += (isCurrent || isPrevious) ? 2 : 4;
            // Current/previous formats no longer store reversed strings; they
            // are recomputed below. Older formats store them with a NULL sentinel.
            const rawExpressionReverseLength = (
                (isCurrent || isPrevious) ?
                U16_NULL :
                ((isPreviousPrevious || isPreviousPreviousPrevious) ? view.getUint16(cursor, true) : view.getUint32(cursor, true))
            );
            if (!(isCurrent || isPrevious)) { cursor += (isPreviousPrevious || isPreviousPreviousPrevious) ? 2 : 4; }
            const rawReadingReverseLength = (
                (isCurrent || isPrevious) ?
                U16_NULL :
                ((isPreviousPrevious || isPreviousPreviousPrevious) ? view.getUint16(cursor, true) : view.getUint32(cursor, true))
            );
            if (!(isCurrent || isPrevious)) { cursor += (isPreviousPrevious || isPreviousPreviousPrevious) ? 2 : 4; }
            // -1 marks "not stored" after sentinel translation.
            const expressionReverseLength = (
                (isCurrent || isPrevious || isPreviousPrevious || isPreviousPreviousPrevious) ?
                (rawExpressionReverseLength === U16_NULL ? -1 : rawExpressionReverseLength) :
                (rawExpressionReverseLength === U32_NULL ? -1 : /** @type {number} */ (rawExpressionReverseLength))
            );
            const readingReverseLength = (
                (isCurrent || isPrevious || isPreviousPrevious || isPreviousPreviousPrevious) ?
                (rawReadingReverseLength === U16_NULL ? -1 : rawReadingReverseLength) :
                (rawReadingReverseLength === U32_NULL ? -1 : /** @type {number} */ (rawReadingReverseLength))
            );
            const rawEntryContentOffset = view.getUint32(cursor, true); cursor += 4;
            let rawEntryContentLength;
            if (isCurrent) {
                // Current format: compact u16 length with sentinels for
                // "null" and "extended u32 follows".
                const compactEntryContentLength = view.getUint16(cursor, true); cursor += 2;
                if (compactEntryContentLength === ENTRY_CONTENT_LENGTH_U16_NULL) {
                    rawEntryContentLength = U32_NULL;
                } else if (compactEntryContentLength === ENTRY_CONTENT_LENGTH_EXTENDED_U16) {
                    rawEntryContentLength = view.getUint32(cursor, true); cursor += 4;
                } else {
                    rawEntryContentLength = compactEntryContentLength;
                }
            } else {
                rawEntryContentLength = view.getUint32(cursor, true); cursor += 4;
            }
            // Dict-name meta: u16 in current format (U16_NULL escapes to a
            // trailing u32), u32 in all older formats.
            const entryContentDictNameMeta16 = isCurrent ? view.getUint16(cursor, true) : view.getUint32(cursor, true); cursor += isCurrent ? 2 : 4;
            const entryContentDictNameMeta = (isCurrent && entryContentDictNameMeta16 === U16_NULL) ? view.getUint32(cursor, true) : entryContentDictNameMeta16;
            if (isCurrent && entryContentDictNameMeta16 === U16_NULL) { cursor += 4; }
            const entryContentDictNameFlags = (isCurrent || isPrevious) ? (entryContentDictNameMeta & ENTRY_CONTENT_DICT_NAME_FLAGS_MASK) : 0;
            const entryContentDictNameValue = (isCurrent || isPrevious) ? (entryContentDictNameMeta & ENTRY_CONTENT_DICT_NAME_VALUE_MASK) : entryContentDictNameMeta;
            const score = view.getInt32(cursor, true); cursor += 4;
            const rawSequence = view.getInt32(cursor, true); cursor += 4;
            let entryContentDictNameLength = 0;
            if (isLegacy || isPrevious || isPreviousPrevious || isPreviousPreviousPrevious || isPreviousPreviousPreviousPrevious || isPreviousPreviousPreviousPreviousPrevious) {
                entryContentDictNameLength = entryContentDictNameMeta;
            } else if ((entryContentDictNameValue & 0xff) === ENTRY_CONTENT_DICT_NAME_CODE_CUSTOM) {
                // Custom names encode their byte length in the upper bits of the value.
                entryContentDictNameLength = entryContentDictNameValue >>> 8;
            }

            // Bounds check covering all variable-length payload that follows the header.
            const requiredBytes =
                dictionaryLength +
                expressionLength +
                readingLength +
                (
                    (isCurrent || isPrevious) ?
                    0 :
                    (
                        Math.max(0, expressionReverseLength) +
                        Math.max(0, readingReverseLength)
                    )
                ) +
                entryContentDictNameLength;
            if ((cursor + requiredBytes) > content.byteLength || id <= 0) {
                return;
            }

            const dictionary = isLegacy ? this._decodeString(content, cursor, dictionaryLength) : shardDictionaryName;
            if (isLegacy) { cursor += dictionaryLength; }
            if (dictionary === null) { return; }
            const expression = this._decodeString(content, cursor, expressionLength); cursor += expressionLength;
            // Flag allows the reading bytes to be omitted when equal to the expression.
            const reading = (
                (isCurrent || isPrevious) && (entryContentDictNameFlags & ENTRY_CONTENT_DICT_NAME_FLAG_READING_EQUALS_EXPRESSION) !== 0 ?
                expression :
                this._decodeString(content, cursor, readingLength)
            );
            if (!((isCurrent || isPrevious) && (entryContentDictNameFlags & ENTRY_CONTENT_DICT_NAME_FLAG_READING_EQUALS_EXPRESSION) !== 0)) {
                cursor += readingLength;
            }
            let expressionReverse;
            let readingReverse;
            if (isCurrent || isPrevious) {
                // Reversed strings are derived, not stored, in the two newest formats.
                expressionReverse = this._reverseString(expression);
                readingReverse = reading === expression ? expressionReverse : this._reverseString(reading);
            } else {
                expressionReverse = expressionReverseLength >= 0 ? this._decodeString(content, cursor, expressionReverseLength) : null;
                if (expressionReverseLength >= 0) { cursor += expressionReverseLength; }
                // NOTE(review): entryContentDictNameFlags is forced to 0 for
                // non-current/non-previous formats above, so the
                // isPreviousPrevious flag test below can never be true —
                // confirm whether the previous-previous format ever set this
                // flag, in which case readings would be mis-decoded here.
                readingReverse = (
                    readingReverseLength >= 0 ?
                    (
                        isPreviousPrevious && (entryContentDictNameFlags & ENTRY_CONTENT_DICT_NAME_FLAG_READING_REVERSE_EQUALS_EXPRESSION_REVERSE) !== 0 ?
                        expressionReverse :
                        this._decodeString(content, cursor, readingReverseLength)
                    ) :
                    null
                );
                if (readingReverseLength >= 0 && !(isPreviousPrevious && (entryContentDictNameFlags & ENTRY_CONTENT_DICT_NAME_FLAG_READING_REVERSE_EQUALS_EXPRESSION_REVERSE) !== 0)) {
                    cursor += readingReverseLength;
                }
            }
            const entryContentDictName = (isLegacy || isPrevious || isPreviousPrevious || isPreviousPreviousPrevious || isPreviousPreviousPreviousPrevious || isPreviousPreviousPreviousPreviousPrevious) ?
                this._decodeString(content, cursor, entryContentDictNameLength) :
                this._decodeEntryContentDictName(entryContentDictNameValue, content, cursor, entryContentDictNameLength);
            cursor += entryContentDictNameLength;

            const record = {
                id,
                dictionary,
                expression,
                reading,
                expressionReverse,
                readingReverse,
                entryContentOffset: rawEntryContentOffset === U32_NULL ? -1 : rawEntryContentOffset,
                entryContentLength: rawEntryContentLength === U32_NULL ? -1 : rawEntryContentLength,
                entryContentDictName,
                score,
                sequence: rawSequence >= 0 ? rawSequence : null,
            };
            this._recordsById.set(id, record);
            if (id >= this._nextId) {
                this._nextId = id + 1;
            }
        }
    }
}

/**
 * Maps a dict-name meta code back to its string; well-known names are
 * stored as one-byte codes, custom names as inline UTF-8 bytes.
 * @param {number} meta
 * @param {Uint8Array} content
 * @param {number} offset
 * @param {number} customLength
 * @returns {string}
 */
_decodeEntryContentDictName(meta, content, offset, customLength) {
    switch (meta & 0xff) {
        case ENTRY_CONTENT_DICT_NAME_CODE_RAW:
            return 'raw';
        case ENTRY_CONTENT_DICT_NAME_CODE_RAW_V2:
            return RAW_TERM_CONTENT_DICT_NAME;
        case ENTRY_CONTENT_DICT_NAME_CODE_RAW_V3:
            return RAW_TERM_CONTENT_SHARED_GLOSSARY_DICT_NAME;
        case ENTRY_CONTENT_DICT_NAME_CODE_JMDICT:
            return 'jmdict';
        case ENTRY_CONTENT_DICT_NAME_CODE_CUSTOM:
            return this._decodeString(content, offset, customLength);
        default:
            // Unknown code: fall back to the default content source.
            return 'raw';
    }
}

/**
 * Inverse of _decodeEntryContentDictName: well-known names become a bare
 * code with no payload bytes; custom names carry (byteLength << 8) | CODE_CUSTOM
 * plus their UTF-8 bytes.
 * @param {string} value
 * @returns {{meta: number, bytes: Uint8Array|null}}
 */
_encodeEntryContentDictNameMeta(value) {
    switch (value) {
        case '':
        case 'raw':
            return {meta: ENTRY_CONTENT_DICT_NAME_CODE_RAW, bytes: null};
        case RAW_TERM_CONTENT_DICT_NAME:
            return {meta: ENTRY_CONTENT_DICT_NAME_CODE_RAW_V2, bytes: null};
        case RAW_TERM_CONTENT_SHARED_GLOSSARY_DICT_NAME:
            return {meta: ENTRY_CONTENT_DICT_NAME_CODE_RAW_V3, bytes: null};
        case 'jmdict':
            return {meta: ENTRY_CONTENT_DICT_NAME_CODE_JMDICT, bytes: null};
        default: {
            const bytes = this._textEncoder.encode(value);
            return {meta: (((bytes.byteLength >>> 0) << 8) | ENTRY_CONTENT_DICT_NAME_CODE_CUSTOM) >>> 0, bytes};
        }
    }
}

/**
 * Loads the legacy newline-delimited-JSON payload format. Lines that fail
 * to parse, are not arrays of at least 11 elements, or carry a non-positive
 * id are skipped silently.
 * @param {Uint8Array} content
 */
_loadLegacyNdjson(content) {
    const text = this._textDecoder.decode(content);
    for (const line of text.split('\n')) {
        if (line.length === 0) { continue; }
        let raw;
        try {
            raw = /** @type {unknown[]} */ (parseJson(line));
        } catch (_) {
            continue;
        }
        if (!Array.isArray(raw) || raw.length < 11) { continue; }
        const id = this._asNumber(raw[0], -1);
        if (id <= 0) { continue; }
        const record = {
            id,
            dictionary: this._asString(raw[1]),
            expression: this._asString(raw[2]),
            reading: this._asString(raw[3]),
            expressionReverse: this._asNullableString(raw[4]),
            readingReverse: this._asNullableString(raw[5]),
            entryContentOffset: this._asNumber(raw[6], -1),
            entryContentLength: this._asNumber(raw[7], -1),
            entryContentDictName: this._asString(raw[8]),
            score: this._asNumber(raw[9], 0),
            sequence: this._asNullableNumber(raw[10]),
        };
        this._recordsById.set(id, record);
        if (id >= this._nextId) {
            this._nextId = id + 1;
        }
    }
}

/**
 * Decodes a UTF-8 slice of content; non-positive lengths yield ''.
 * @param {Uint8Array} content
 * @param {number} offset
 * @param {number} length
 * @returns {string}
 */
_decodeString(content, offset, length) {
    if (length <= 0) {
        return '';
    }
    return this._textDecoder.decode(content.subarray(offset, offset + length));
}

/**
 * Reverses a string while keeping UTF-16 surrogate pairs intact
 * (a low surrogate preceded by a high surrogate is emitted as a unit).
 * @param {string} value
 * @returns {string}
 */
_reverseString(value) {
    let result = '';
    for (let i = value.length - 1; i >= 0; --i) {
        const c = value.charCodeAt(i);
        if (
            (c & 0xfc00) === 0xdc00 &&
            i > 0
        ) {
            const c2 = value.charCodeAt(i - 1);
            if ((c2 & 0xfc00) === 0xd800) {
                result += value[i - 1] + value[i];
                --i;
                continue;
            }
        }
        result += value[i];
    }
    return result;
}

/**
 * Encodes records into the current binary chunk layout. Prefers the wasm
 * encoder; on its first failure it is permanently disabled for this
 * instance and the pure-JS fallback below is used instead.
 * Record ids are not written here — the chunk header written by
 * _appendEncodedChunk carries the base id (see _loadBinary's decode side).
 * @param {TermRecord[]} records
 * @returns {Promise} Resolves with the encoded bytes (Uint8Array).
 */
async _encodeRecords(records) {
    if (records.length === 0) {
        return new Uint8Array(0);
    }
    if (!this._wasmEncoderUnavailable) {
        try {
            const encoded = await encodeTermRecordsWithWasm(records, this._textEncoder);
            if (encoded instanceof Uint8Array) {
                return encoded;
            }
        } catch (_) {
            // Wasm path failed once; stick to the JS fallback from now on.
            this._wasmEncoderUnavailable = true;
        }
    }
    // Pass 1: encode strings and compute the exact output size, accounting
    // for the optional extended u32 fields (oversized meta / content length).
    /** @type {Array<{record: TermRecord, expressionBytes: Uint8Array, readingBytes: Uint8Array, entryContentDictNameMeta: number, entryContentDictNameBytes: Uint8Array|null}>} */
    const encodedRows = [];
    let totalBytes = 0;
    for (const record of records) {
        const expressionBytes = this._textEncoder.encode(record.expression);
        const readingBytes = this._textEncoder.encode(record.reading);
        const {meta: entryContentDictNameMeta, bytes: entryContentDictNameBytes} = this._encodeEntryContentDictNameMeta(record.entryContentDictName);
        totalBytes +=
            RECORD_HEADER_BYTES +
            expressionBytes.byteLength +
            readingBytes.byteLength +
            (entryContentDictNameBytes?.byteLength ?? 0) +
            (((entryContentDictNameMeta & ~ENTRY_CONTENT_DICT_NAME_FLAGS_MASK) <= ENTRY_CONTENT_DICT_NAME_VALUE_MASK) ? 0 : 4) +
            ((record.entryContentLength >= 0 && record.entryContentLength > 0xfffd) ? 4 : 0);
        encodedRows.push({
            record,
            expressionBytes,
            readingBytes,
            entryContentDictNameMeta,
            entryContentDictNameBytes,
        });
    }

    // Pass 2: write headers and payloads; field order must mirror
    // _loadBinary's "current" decode path exactly. Little-endian throughout.
    const output = new Uint8Array(totalBytes);
    const view = new DataView(output.buffer, output.byteOffset, output.byteLength);
    let cursor = 0;
    for (const row of encodedRows) {
        const {record, expressionBytes, readingBytes, entryContentDictNameMeta, entryContentDictNameBytes} = row;
        view.setUint16(cursor, expressionBytes.byteLength, true); cursor += 2;
        view.setUint16(cursor, readingBytes.byteLength, true); cursor += 2;
        view.setUint32(cursor, record.entryContentOffset >= 0 ? record.entryContentOffset : U32_NULL, true); cursor += 4;
        if (record.entryContentLength < 0) {
            view.setUint16(cursor, ENTRY_CONTENT_LENGTH_U16_NULL, true); cursor += 2;
        } else if (record.entryContentLength <= 0xfffd) {
            view.setUint16(cursor, record.entryContentLength, true); cursor += 2;
        } else {
            // Length does not fit the compact u16: write the escape sentinel
            // followed by the full u32 value.
            view.setUint16(cursor, ENTRY_CONTENT_LENGTH_EXTENDED_U16, true); cursor += 2;
            view.setUint32(cursor, record.entryContentLength, true); cursor += 4;
        }
        if ((entryContentDictNameMeta & ~ENTRY_CONTENT_DICT_NAME_FLAGS_MASK) <= ENTRY_CONTENT_DICT_NAME_VALUE_MASK) {
            view.setUint16(cursor, entryContentDictNameMeta, true); cursor += 2;
        } else {
            // Oversized meta: U16_NULL escape followed by the full u32 meta.
            view.setUint16(cursor, U16_NULL, true); cursor += 2;
            view.setUint32(cursor, entryContentDictNameMeta, true); cursor += 4;
        }
        view.setInt32(cursor, record.score, true); cursor += 4;
        view.setInt32(cursor, record.sequence ?? -1, true); cursor += 4;

        output.set(expressionBytes, cursor); cursor += expressionBytes.byteLength;
        output.set(readingBytes, cursor); cursor += readingBytes.byteLength;
        if (entryContentDictNameBytes !== null) {
            output.set(entryContentDictNameBytes, cursor);
            cursor += entryContentDictNameBytes.byteLength;
        }
    }
    return output;
}

/**
 * Appends an encoded chunk to a shard file; empty chunks are ignored.
 * @param {TermRecordShardState} state
 * @param {Uint8Array} chunk
 * @param {number} firstId
 * @param {number} count
 * @returns {Promise}
 */
async _appendEncodedChunk(state, chunk, firstId, count) {
    if (chunk.byteLength <= 0) { return; }

    const withHeader = state.fileLength === 0 ?
+ this._withBinaryHeader(this._withChunkHeader(chunk, firstId, count)) : + this._withChunkHeader(chunk, firstId, count); + state.pendingWriteChunks.push(withHeader); + state.pendingWriteBytes += withHeader.byteLength; + state.fileLength += withHeader.byteLength; + + if (!this._importSessionActive || state.pendingWriteBytes >= this._flushThresholdBytes) { + await this._flushPendingWritesForShard(state); + if (!this._importSessionActive) { + await this._closeShardWritable(state); + } + } + } + + /** + * @param {Uint8Array} payload + * @returns {Uint8Array} + */ + _withBinaryHeader(payload) { + const header = this._textEncoder.encode(BINARY_MAGIC_TEXT); + const output = new Uint8Array(header.byteLength + payload.byteLength); + output.set(header, 0); + output.set(payload, header.byteLength); + return output; + } + + /** + * @param {Uint8Array} payload + * @param {number} firstId + * @param {number} count + * @returns {Uint8Array} + */ + _withChunkHeader(payload, firstId, count) { + const output = new Uint8Array(CHUNK_HEADER_BYTES + payload.byteLength); + const view = new DataView(output.buffer, output.byteOffset, output.byteLength); + view.setUint32(0, firstId >>> 0, true); + view.setUint32(4, count >>> 0, true); + output.set(payload, CHUNK_HEADER_BYTES); + return output; + } + + /** + * @returns {Promise} + */ + async _flushPendingWrites() { + if (this._shardStateByFileName.size === 0) { + return; + } + for (const state of this._shardStateByFileName.values()) { + await this._flushPendingWritesForShard(state); + } + } + + /** + * @returns {Promise} + */ + async _closeAllWritables() { + for (const state of this._shardStateByFileName.values()) { + await this._closeShardWritable(state); + } + } + + /** + * @param {TermRecordShardState} state + * @returns {Promise} + */ + async _closeShardWritable(state) { + if (state.writable === null) { + return; + } + try { + await state.writable.close(); + } finally { + state.writable = null; + } + } + + /** + * @returns {Promise} + */ 
+ async _rewriteAllShardsFromMemory() { + if (this._recordsDirectoryHandle === null) { + return; + } + await this._closeAllWritables(); + this._shardStateByFileName.clear(); + + const existingShardFileNames = await this._listShardFileNames(); + for (const fileName of existingShardFileNames) { + try { + await this._recordsDirectoryHandle.removeEntry(fileName); + } catch (_) { + // NOP + } + } + + /** @type {Map} */ + const recordsByDictionary = new Map(); + const orderedRecords = [...this._recordsById.values()].sort((a, b) => a.id - b.id); + for (const record of orderedRecords) { + const list = recordsByDictionary.get(record.dictionary); + if (typeof list === 'undefined') { + recordsByDictionary.set(record.dictionary, [record]); + } else { + list.push(record); + } + } + + for (const [dictionaryName, records] of recordsByDictionary) { + const payload = await this._encodeRecords(records); + const fileName = this._getShardFileName(dictionaryName); + const fileHandle = await this._recordsDirectoryHandle.getFileHandle(fileName, {create: true}); + const writable = await fileHandle.createWritable(); + await writable.truncate(0); + let fileLength = 0; + if (payload.byteLength > 0) { + const output = this._withBinaryHeader(payload); + await writable.write(output); + fileLength = output.byteLength; + } + await writable.close(); + this._shardStateByFileName.set(fileName, this._createShardState(fileName, fileHandle, fileLength)); + } + } + + /** + * @returns {Promise} + */ + async _loadShardFiles() { + if (this._recordsDirectoryHandle === null) { + return 0; + } + const entriesMethod = /** @type {unknown} */ (Reflect.get(this._recordsDirectoryHandle, 'entries')); + if (typeof entriesMethod !== 'function') { + return 0; + } + const entries = /** @type {() => AsyncIterable<[string, FileSystemHandle]>} */ (entriesMethod).call(this._recordsDirectoryHandle); + let shardFileCount = 0; + for await (const entry of entries) { + const name = String(entry[0] ?? 
''); + const fileSystemHandle = /** @type {FileSystemHandle} */ (/** @type {unknown} */ (entry[1])); + if (fileSystemHandle.kind !== 'file' || !this._isShardFileName(name)) { + continue; + } + const fileHandle = /** @type {FileSystemFileHandle} */ (fileSystemHandle); + let file; + try { + file = await fileHandle.getFile(); + } catch (_) { + continue; + } + ++shardFileCount; + const state = this._createShardState(name, fileHandle, file.size); + this._shardStateByFileName.set(name, state); + if (file.size <= 0) { + continue; + } + const arrayBuffer = await file.arrayBuffer(); + const content = new Uint8Array(arrayBuffer); + if (this._isBinaryFormat(content)) { + this._loadBinary(content, this._decodeDictionaryNameFromShardFileName(name)); + continue; + } + // Invalid shard payloads are discarded so they cannot poison future reads. + this._invalidShardFileNames.push(name); + this._shardStateByFileName.delete(name); + if (this._recordsDirectoryHandle !== null) { + try { + await this._recordsDirectoryHandle.removeEntry(name); + } catch (_) { + // NOP + } + } + } + return shardFileCount; + } + + /** + * @returns {Promise} + */ + async _migrateLegacyMonolithicIfPresent() { + if (this._rootDirectoryHandle === null) { + return false; + } + let fileHandle; + try { + fileHandle = await this._rootDirectoryHandle.getFileHandle(LEGACY_FILE_NAME, {create: false}); + } catch (_) { + return false; + } + const file = await fileHandle.getFile(); + if (file.size <= 0) { + await this._deleteLegacyMonolithicIfPresent(); + return false; + } + const content = new Uint8Array(await file.arrayBuffer()); + if (this._isBinaryFormat(content)) { + this._loadBinary(content); + } else { + this._loadLegacyNdjson(content); + } + await this._rewriteAllShardsFromMemory(); + await this._deleteLegacyMonolithicIfPresent(); + return true; + } + + /** + * @returns {Promise} + */ + async _deleteLegacyMonolithicIfPresent() { + if (this._rootDirectoryHandle === null) { + return; + } + try { + await 
this._rootDirectoryHandle.removeEntry(LEGACY_FILE_NAME); + } catch (_) { + // NOP + } + } + + /** + * @returns {Promise} + */ + async _listShardFileNames() { + if (this._recordsDirectoryHandle === null) { + return []; + } + const entriesMethod = /** @type {unknown} */ (Reflect.get(this._recordsDirectoryHandle, 'entries')); + if (typeof entriesMethod !== 'function') { + return []; + } + const entries = /** @type {() => AsyncIterable<[string, FileSystemHandle]>} */ (entriesMethod).call(this._recordsDirectoryHandle); + /** @type {string[]} */ + const names = []; + for await (const entry of entries) { + const name = String(entry[0] ?? ''); + const fileSystemHandle = /** @type {FileSystemHandle} */ (/** @type {unknown} */ (entry[1])); + if (fileSystemHandle.kind === 'file' && this._isShardFileName(name)) { + names.push(name); + } + } + return names; + } + + /** + * @returns {boolean} + */ + _hasPendingShardWrites() { + for (const state of this._shardStateByFileName.values()) { + if (state.writable !== null || state.pendingWriteBytes > 0 || state.pendingWriteChunks.length > 0) { + return true; + } + } + return false; + } + + /** + * @param {string} dictionaryName + * @returns {Promise} + */ + async _getOrCreateShardState(dictionaryName) { + if (this._recordsDirectoryHandle === null) { + return null; + } + const fileName = this._getShardFileName(dictionaryName); + const existing = this._shardStateByFileName.get(fileName); + if (typeof existing !== 'undefined') { + return existing; + } + const fileHandle = await this._recordsDirectoryHandle.getFileHandle(fileName, {create: true}); + const file = await fileHandle.getFile(); + const created = this._createShardState(fileName, fileHandle, file.size); + this._shardStateByFileName.set(fileName, created); + return created; + } + + /** + * @param {TermRecordShardState} state + * @returns {Promise} + */ + async _flushPendingWritesForShard(state) { + if (state.pendingWriteBytes <= 0 || state.pendingWriteChunks.length === 0) { + 
return; + } + if (state.writable === null) { + state.writable = await state.fileHandle.createWritable({keepExistingData: true}); + const seekOffset = state.fileLength - state.pendingWriteBytes; + await state.writable.seek(Math.max(0, seekOffset)); + } + const chunks = this._coalescePendingChunks(state.pendingWriteChunks); + state.pendingWriteChunks = []; + state.pendingWriteBytes = 0; + for (const chunk of chunks) { + if (chunk.byteLength <= 0) { continue; } + await state.writable.write(chunk); + } + } + + /** + * @param {string} dictionaryName + * @returns {Promise} + */ + async _deleteShardByDictionary(dictionaryName) { + if (this._recordsDirectoryHandle === null) { + return; + } + const fileName = this._getShardFileName(dictionaryName); + const state = this._shardStateByFileName.get(fileName); + if (typeof state !== 'undefined') { + await this._flushPendingWritesForShard(state); + await this._closeShardWritable(state); + this._shardStateByFileName.delete(fileName); + } + try { + await this._recordsDirectoryHandle.removeEntry(fileName); + } catch (_) { + // NOP + } + } + + /** + * @param {string} fileName + * @param {FileSystemFileHandle} fileHandle + * @param {number} fileLength + * @returns {TermRecordShardState} + */ + _createShardState(fileName, fileHandle, fileLength) { + return { + fileName, + fileHandle, + writable: null, + fileLength, + pendingWriteBytes: 0, + pendingWriteChunks: [], + }; + } + + /** + * @returns {number} + */ + _computeFlushThresholdBytes() { + /** @type {number|null} */ + let memoryGiB = null; + try { + const rawValue = /** @type {unknown} */ (Reflect.get(globalThis.navigator ?? 
{}, 'deviceMemory')); + if (typeof rawValue === 'number' && Number.isFinite(rawValue) && rawValue > 0) { + memoryGiB = rawValue; + } + } catch (_) { + // NOP + } + if (memoryGiB !== null) { + if (memoryGiB <= 4) { + return LOW_MEMORY_FLUSH_THRESHOLD_BYTES; + } + if (memoryGiB >= 8) { + return HIGH_MEMORY_FLUSH_THRESHOLD_BYTES; + } + } + return DEFAULT_FLUSH_THRESHOLD_BYTES; + } + + /** + * @returns {number} + */ + _computeWriteCoalesceTargetBytes() { + /** @type {number|null} */ + let memoryGiB = null; + try { + const rawValue = /** @type {unknown} */ (Reflect.get(globalThis.navigator ?? {}, 'deviceMemory')); + if (typeof rawValue === 'number' && Number.isFinite(rawValue) && rawValue > 0) { + memoryGiB = rawValue; + } + } catch (_) { + // NOP + } + if (memoryGiB !== null) { + if (memoryGiB <= 4) { + return LOW_MEMORY_WRITE_COALESCE_TARGET_BYTES; + } + if (memoryGiB >= 8) { + return HIGH_MEMORY_WRITE_COALESCE_TARGET_BYTES; + } + } + return DEFAULT_WRITE_COALESCE_TARGET_BYTES; + } + + /** + * @param {Uint8Array[]} chunks + * @returns {Uint8Array[]} + */ + _coalescePendingChunks(chunks) { + const targetBytes = this._writeCoalesceTargetBytes; + if (chunks.length <= 1 || targetBytes <= 0) { + return chunks; + } + /** @type {Uint8Array[]} */ + const result = []; + /** @type {Uint8Array[]} */ + let group = []; + let groupBytes = 0; + for (const chunk of chunks) { + const chunkBytes = chunk.byteLength; + if (chunkBytes <= 0) { continue; } + if (groupBytes > 0 && (groupBytes + chunkBytes > targetBytes || group.length >= WRITE_COALESCE_MAX_CHUNKS)) { + result.push(this._mergeChunks(group, groupBytes)); + group = []; + groupBytes = 0; + } + if (chunkBytes >= targetBytes) { + if (groupBytes > 0) { + result.push(this._mergeChunks(group, groupBytes)); + group = []; + groupBytes = 0; + } + result.push(chunk); + continue; + } + group.push(chunk); + groupBytes += chunkBytes; + } + if (groupBytes > 0) { + result.push(this._mergeChunks(group, groupBytes)); + } + return result; + } + 
+ /** + * @param {Uint8Array[]} chunks + * @param {number} totalBytes + * @returns {Uint8Array} + */ + _mergeChunks(chunks, totalBytes) { + if (chunks.length === 1) { + return chunks[0]; + } + const output = new Uint8Array(totalBytes); + let offset = 0; + for (const chunk of chunks) { + output.set(chunk, offset); + offset += chunk.byteLength; + } + return output; + } + + /** + * @param {string} dictionaryName + * @returns {string} + */ + _getShardFileName(dictionaryName) { + return `${SHARD_FILE_PREFIX}${encodeURIComponent(dictionaryName)}${SHARD_FILE_SUFFIX}`; + } + + /** + * @param {string} fileName + * @returns {boolean} + */ + _isShardFileName(fileName) { + return fileName.startsWith(SHARD_FILE_PREFIX) && fileName.endsWith(SHARD_FILE_SUFFIX); + } + + /** + * @param {string} fileName + * @returns {string|null} + */ + _decodeDictionaryNameFromShardFileName(fileName) { + if (!this._isShardFileName(fileName)) { + return null; + } + const encoded = fileName.slice(SHARD_FILE_PREFIX.length, fileName.length - SHARD_FILE_SUFFIX.length); + try { + const decoded = decodeURIComponent(encoded); + return decoded.length > 0 ? 
decoded : null; + } catch (_) { + return null; + } + } + + /** */ + _ensureIndexesReady() { + if (!this._indexDirty) { + return; + } + this._indexByDictionary.clear(); + this._indexDirty = false; + } + + /** */ + _rebuildIndexesFromRecords() { + this._indexByDictionary.clear(); + for (const record of this._recordsById.values()) { + this._addToIndex(record); + } + this._indexDirty = false; + } + + /** + * @param {TermRecord} record + */ + _addToIndex(record) { + let index = this._indexByDictionary.get(record.dictionary); + if (typeof index === 'undefined') { + index = { + expression: new Map(), + reading: new Map(), + expressionReverse: new Map(), + readingReverse: new Map(), + pair: new Map(), + sequence: new Map(), + }; + this._indexByDictionary.set(record.dictionary, index); + } + this._addRecordToDictionaryIndex(index, record); + } + + /** + * @param {{expression: Map, reading: Map, expressionReverse: Map, readingReverse: Map, pair: Map, sequence: Map}} index + * @param {TermRecord} record + */ + _addRecordToDictionaryIndex(index, record) { + const expressionList = index.expression.get(record.expression); + if (typeof expressionList === 'undefined') { + index.expression.set(record.expression, [record.id]); + } else { + expressionList.push(record.id); + } + + const readingList = index.reading.get(record.reading); + if (typeof readingList === 'undefined') { + index.reading.set(record.reading, [record.id]); + } else { + readingList.push(record.id); + } + if (record.expressionReverse !== null) { + const expressionReverseList = index.expressionReverse.get(record.expressionReverse); + if (typeof expressionReverseList === 'undefined') { + index.expressionReverse.set(record.expressionReverse, [record.id]); + } else { + expressionReverseList.push(record.id); + } + } + if (record.readingReverse !== null) { + const readingReverseList = index.readingReverse.get(record.readingReverse); + if (typeof readingReverseList === 'undefined') { + 
index.readingReverse.set(record.readingReverse, [record.id]); + } else { + readingReverseList.push(record.id); + } + } + + const pairKey = `${record.expression}\u001f${record.reading}`; + const pairList = index.pair.get(pairKey); + if (typeof pairList === 'undefined') { + index.pair.set(pairKey, [record.id]); + } else { + pairList.push(record.id); + } + + if (typeof record.sequence === 'number' && record.sequence >= 0) { + const sequenceList = index.sequence.get(record.sequence); + if (typeof sequenceList === 'undefined') { + index.sequence.set(record.sequence, [record.id]); + } else { + sequenceList.push(record.id); + } + } + } + + /** + * @param {unknown} value + * @param {number} fallback + * @returns {number} + */ + _asNumber(value, fallback) { + if (typeof value === 'number' && Number.isFinite(value)) { + return value; + } + if (typeof value === 'string' && value.length > 0) { + const parsed = Number(value); + if (Number.isFinite(parsed)) { + return parsed; + } + } + return fallback; + } + + /** + * @param {unknown} value + * @returns {number|null} + */ + _asNullableNumber(value) { + if (value === null || typeof value === 'undefined') { + return null; + } + return this._asNumber(value, 0); + } + + /** + * @param {unknown} value + * @returns {string} + */ + _asString(value) { + return typeof value === 'string' ? 
value : ''; + } + + /** + * @param {unknown} value + * @returns {string|null} + */ + _asNullableString(value) { + if (value === null || typeof value === 'undefined') { + return null; + } + return this._asString(value); + } +} diff --git a/ext/js/dictionary/term-record-wasm-encoder.js b/ext/js/dictionary/term-record-wasm-encoder.js new file mode 100644 index 0000000000..f0330f5d43 --- /dev/null +++ b/ext/js/dictionary/term-record-wasm-encoder.js @@ -0,0 +1,222 @@ +/* + * Copyright (C) 2023-2025 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +import {RAW_TERM_CONTENT_DICT_NAME, RAW_TERM_CONTENT_SHARED_GLOSSARY_DICT_NAME} from './raw-term-content.js'; + +const META_U32_FIELDS = 10; +const META_BYTES = META_U32_FIELDS * 4; +const U32_NULL = 0xffffffff; +const U16_NULL = 0xffff; +const ENTRY_CONTENT_DICT_NAME_CODE_RAW = 0; +const ENTRY_CONTENT_DICT_NAME_CODE_RAW_V2 = 1; +const ENTRY_CONTENT_DICT_NAME_CODE_RAW_V3 = 2; +const ENTRY_CONTENT_DICT_NAME_CODE_JMDICT = 3; +const ENTRY_CONTENT_DICT_NAME_CODE_CUSTOM = 0xff; +const ENTRY_CONTENT_DICT_NAME_FLAG_READING_EQUALS_EXPRESSION = 0x8000; + +/** + * @param {string} value + * @returns {{meta: number, requiresString: boolean}} + */ +function encodeEntryContentDictNameMeta(value) { + switch (value) { + case '': + case 'raw': + return {meta: ENTRY_CONTENT_DICT_NAME_CODE_RAW, requiresString: false}; + case RAW_TERM_CONTENT_DICT_NAME: + return {meta: ENTRY_CONTENT_DICT_NAME_CODE_RAW_V2, requiresString: false}; + case RAW_TERM_CONTENT_SHARED_GLOSSARY_DICT_NAME: + return {meta: ENTRY_CONTENT_DICT_NAME_CODE_RAW_V3, requiresString: false}; + case 'jmdict': + return {meta: ENTRY_CONTENT_DICT_NAME_CODE_JMDICT, requiresString: false}; + default: + return {meta: ENTRY_CONTENT_DICT_NAME_CODE_CUSTOM, requiresString: true}; + } +} + +/** @type {Promise<{memory: WebAssembly.Memory, wasm_reset_heap: () => void, wasm_alloc: (size: number) => number, calc_encoded_size: (count: number, metasPtr: number) => number, encode_records: (count: number, metasPtr: number, stringsPtr: number, outPtr: number) => number}>|null} */ +let wasmPromise = null; + +/** + * @param {TextEncoder} textEncoder + * @returns {{stringOffsets: number[], stringLengths: number[], internString: (value: string) => number, buildStringsBuffer: () => Uint8Array}} + */ +function createStringInterner(textEncoder) { + let stringsTotal = 0; + let stringsCapacity = 64 * 1024; + let stringsBuffer = new Uint8Array(stringsCapacity); + /** @type {Map} */ + const encodedStringIndexByValue = new Map(); + /** @type 
{number[]} */ + const stringOffsets = []; + /** @type {number[]} */ + const stringLengths = []; + + /** + * @param {number} requiredCapacity + * @returns {void} + */ + const ensureCapacity = (requiredCapacity) => { + if (requiredCapacity <= stringsCapacity) { + return; + } + let nextCapacity = stringsCapacity; + while (nextCapacity < requiredCapacity) { + nextCapacity *= 2; + } + const nextBuffer = new Uint8Array(nextCapacity); + nextBuffer.set(stringsBuffer.subarray(0, stringsTotal)); + stringsBuffer = nextBuffer; + stringsCapacity = nextCapacity; + }; + + /** + * @param {string} value + * @returns {number} + */ + const internString = (value) => { + const cachedIndex = encodedStringIndexByValue.get(value); + if (typeof cachedIndex === 'number') { + return cachedIndex; + } + const index = stringOffsets.length; + const offset = stringsTotal; + ensureCapacity(offset + (value.length * 3)); + const {written = 0} = textEncoder.encodeInto(value, stringsBuffer.subarray(offset)); + stringOffsets.push(stringsTotal); + stringLengths.push(written); + stringsTotal += written; + encodedStringIndexByValue.set(value, index); + return index; + }; + + return { + stringOffsets, + stringLengths, + internString, + buildStringsBuffer: () => stringsBuffer.subarray(0, stringsTotal), + }; +} + +/** + * @returns {Promise<{memory: WebAssembly.Memory, wasm_reset_heap: () => void, wasm_alloc: (size: number) => number, calc_encoded_size: (count: number, metasPtr: number) => number, encode_records: (count: number, metasPtr: number, stringsPtr: number, outPtr: number) => number}>} + */ +async function getWasm() { + if (wasmPromise !== null) { + return await wasmPromise; + } + wasmPromise = (async () => { + const url = new URL('../../lib/term-record-encoder.wasm', import.meta.url); + const response = await fetch(url); + const bytes = await response.arrayBuffer(); + const instance = await WebAssembly.instantiate(bytes, {}); + const exports = /** @type {WebAssembly.Exports & {memory?: 
WebAssembly.Memory, wasm_reset_heap?: () => void, wasm_alloc?: (size: number) => number, calc_encoded_size?: (count: number, metasPtr: number) => number, encode_records?: (count: number, metasPtr: number, stringsPtr: number, outPtr: number) => number}} */ (instance.instance.exports); + if (!(exports.memory instanceof WebAssembly.Memory) || typeof exports.wasm_reset_heap !== 'function' || typeof exports.wasm_alloc !== 'function' || typeof exports.calc_encoded_size !== 'function' || typeof exports.encode_records !== 'function') { + throw new Error('term-record wasm encoder exports are invalid'); + } + return { + memory: exports.memory, + wasm_reset_heap: exports.wasm_reset_heap, + wasm_alloc: exports.wasm_alloc, + calc_encoded_size: exports.calc_encoded_size, + encode_records: exports.encode_records, + }; + })(); + return await wasmPromise; +} + +/** + * @param {{id: number, dictionary: string, expression: string, reading: string, expressionReverse: string|null, readingReverse: string|null, entryContentOffset: number, entryContentLength: number, entryContentDictName: string, score: number, sequence: number|null}[]} records + * @param {TextEncoder} textEncoder + * @returns {Promise} + */ +export async function encodeTermRecordsWithWasm(records, textEncoder) { + if (records.length === 0) { + return new Uint8Array(0); + } + const wasm = await getWasm(); + + const metasBuffer = new ArrayBuffer(records.length * META_BYTES); + const metasU32 = new Uint32Array(metasBuffer); + const metasI32 = new Int32Array(metasBuffer); + const {stringOffsets, stringLengths, internString, buildStringsBuffer} = createStringInterner(textEncoder); + const firstRecord = records[0]; + const firstDictNameMeta = encodeEntryContentDictNameMeta(firstRecord.entryContentDictName); + const sharedDictNameIndex = firstDictNameMeta.requiresString ? 
internString(firstRecord.entryContentDictName) : -1; + + let recordIndex = 0; + for (const record of records) { + const expressionIndex = internString(record.expression); + const readingEqualsExpression = record.reading === record.expression; + const readingIndex = readingEqualsExpression ? expressionIndex : internString(record.reading); + const dictNameMetaInfo = encodeEntryContentDictNameMeta(record.entryContentDictName); + const dictNameIndex = dictNameMetaInfo.requiresString ? + (record.entryContentDictName === firstRecord.entryContentDictName ? sharedDictNameIndex : internString(record.entryContentDictName)) : + -1; + if ( + stringLengths[expressionIndex] > U16_NULL || + stringLengths[readingIndex] > U16_NULL + ) { + return null; + } + const metaIndex = recordIndex * META_U32_FIELDS; + const dictNameFlags = ( + (readingEqualsExpression ? ENTRY_CONTENT_DICT_NAME_FLAG_READING_EQUALS_EXPRESSION : 0) + ) >>> 0; + metasU32[metaIndex + 0] = stringOffsets[expressionIndex] >>> 0; + metasU32[metaIndex + 1] = stringLengths[expressionIndex] >>> 0; + metasU32[metaIndex + 2] = stringOffsets[readingIndex] >>> 0; + metasU32[metaIndex + 3] = readingEqualsExpression ? 0 : (stringLengths[readingIndex] >>> 0); + metasI32[metaIndex + 4] = record.entryContentOffset | 0; + metasI32[metaIndex + 5] = record.entryContentLength | 0; + if (dictNameMetaInfo.requiresString && dictNameIndex >= 0) { + metasU32[metaIndex + 6] = ((((stringLengths[dictNameIndex] >>> 0) << 8) | ENTRY_CONTENT_DICT_NAME_CODE_CUSTOM) | dictNameFlags) >>> 0; + metasU32[metaIndex + 7] = stringOffsets[dictNameIndex] >>> 0; + } else { + metasU32[metaIndex + 6] = (dictNameMetaInfo.meta | dictNameFlags) >>> 0; + metasU32[metaIndex + 7] = U32_NULL; + } + metasI32[metaIndex + 8] = record.score | 0; + metasI32[metaIndex + 9] = record.sequence ?? 
-1; + ++recordIndex; + } + const stringsBuffer = buildStringsBuffer(); + wasm.wasm_reset_heap(); + const metasPtr = wasm.wasm_alloc(metasBuffer.byteLength); + const stringsPtr = wasm.wasm_alloc(stringsBuffer.byteLength); + if (metasPtr === 0 || stringsPtr === 0) { + return null; + } + const wasmHeapAfterAlloc = new Uint8Array(wasm.memory.buffer); + wasmHeapAfterAlloc.set(new Uint8Array(metasBuffer), metasPtr); + wasmHeapAfterAlloc.set(stringsBuffer, stringsPtr); + + const encodedSize = wasm.calc_encoded_size(records.length, metasPtr); + if (encodedSize <= 0) { + return new Uint8Array(0); + } + const outPtr = wasm.wasm_alloc(encodedSize); + if (outPtr === 0) { + return null; + } + const written = wasm.encode_records(records.length, metasPtr, stringsPtr, outPtr); + if (written <= 0) { + return new Uint8Array(0); + } + const heapAfterEncode = new Uint8Array(wasm.memory.buffer); + return heapAfterEncode.slice(outPtr, outPtr + written); +} diff --git a/ext/js/dictionary/wasm/term-bank-parser.c b/ext/js/dictionary/wasm/term-bank-parser.c new file mode 100644 index 0000000000..08e886a6af --- /dev/null +++ b/ext/js/dictionary/wasm/term-bank-parser.c @@ -0,0 +1,530 @@ +/* + * Copyright (C) 2026 Yomitan authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include + +#define WASM_PAGE_SIZE 65536u +#define FNV1A_OFFSET 0x811c9dc5u +#define MIX_OFFSET 0x9e3779b9u + +extern unsigned char __heap_base; + +static uint32_t heap_ptr = 0u; + +typedef struct { + uint32_t expression_start; + uint32_t expression_length; + uint32_t reading_start; + uint32_t reading_length; + uint32_t definition_tags_start; + uint32_t definition_tags_length; + uint32_t rules_start; + uint32_t rules_length; + uint32_t score_start; + uint32_t score_length; + uint32_t glossary_start; + uint32_t glossary_length; + uint32_t sequence_start; + uint32_t sequence_length; + uint32_t term_tags_start; + uint32_t term_tags_length; +} TermRowMeta; + +static uint32_t align8(uint32_t value) { + return (value + 7u) & ~7u; +} + +static int ensure_memory(uint32_t required_bytes) { + uint32_t current_pages = __builtin_wasm_memory_size(0); + uint32_t current_bytes = current_pages * WASM_PAGE_SIZE; + if (required_bytes <= current_bytes) { + return 1; + } + uint32_t missing = required_bytes - current_bytes; + uint32_t grow_pages = (missing + (WASM_PAGE_SIZE - 1u)) / WASM_PAGE_SIZE; + int32_t rc = __builtin_wasm_memory_grow(0, grow_pages); + return rc >= 0; +} + +__attribute__((visibility("default"))) +void wasm_reset_heap(void) { + heap_ptr = (uint32_t)(uintptr_t)&__heap_base; +} + +__attribute__((visibility("default"))) +uint32_t wasm_alloc(uint32_t size) { + if (heap_ptr == 0u) { + wasm_reset_heap(); + } + uint32_t aligned_size = align8(size); + uint32_t start = align8(heap_ptr); + uint32_t end = start + aligned_size; + if (!ensure_memory(end)) { + return 0u; + } + heap_ptr = end; + return start; +} + +static int is_ws(uint8_t c) { + return c == ' ' || c == '\n' || c == '\r' || c == '\t'; +} + +static uint32_t skip_ws(const uint8_t* src, uint32_t len, uint32_t i) { + while (i < len && is_ws(src[i])) { ++i; } + return i; +} + +static int parse_string_span(const uint8_t* src, uint32_t len, uint32_t start, uint32_t* out_end) { + if (start >= len || src[start] != 
'"') { return 0; } + uint32_t i = start + 1u; + while (i < len) { + uint8_t c = src[i]; + if (c == '\\') { + i += 2u; + continue; + } + if (c == '"') { + *out_end = i + 1u; + return 1; + } + ++i; + } + return 0; +} + +static int parse_composite_span(const uint8_t* src, uint32_t len, uint32_t start, uint32_t* out_end) { + if (start >= len) { return 0; } + uint8_t open = src[start]; + uint8_t close = 0; + if (open == '[') { close = ']'; } + else if (open == '{') { close = '}'; } + else { return 0; } + + uint32_t depth = 1u; + uint32_t i = start + 1u; + while (i < len) { + uint8_t c = src[i]; + if (c == '"') { + uint32_t s_end = 0; + if (!parse_string_span(src, len, i, &s_end)) { return 0; } + i = s_end; + continue; + } + if (c == open) { ++depth; ++i; continue; } + if (c == close) { + --depth; + ++i; + if (depth == 0u) { + *out_end = i; + return 1; + } + continue; + } + ++i; + } + return 0; +} + +static int parse_scalar_span(const uint8_t* src, uint32_t len, uint32_t start, uint32_t* out_end) { + if (start >= len) { return 0; } + uint32_t i = start; + while (i < len) { + uint8_t c = src[i]; + if (c == ',' || c == ']' || c == '}' || is_ws(c)) { break; } + ++i; + } + if (i == start) { return 0; } + *out_end = i; + return 1; +} + +static int parse_value_span(const uint8_t* src, uint32_t len, uint32_t start, uint32_t* out_end) { + if (start >= len) { return 0; } + uint8_t c = src[start]; + if (c == '"') { return parse_string_span(src, len, start, out_end); } + if (c == '[' || c == '{') { return parse_composite_span(src, len, start, out_end); } + return parse_scalar_span(src, len, start, out_end); +} + +static void set_field(TermRowMeta* meta, uint32_t field_index, uint32_t start, uint32_t end) { + uint32_t length = end > start ? 
(end - start) : 0u; + switch (field_index) { + case 0: meta->expression_start = start; meta->expression_length = length; break; + case 1: meta->reading_start = start; meta->reading_length = length; break; + case 2: meta->definition_tags_start = start; meta->definition_tags_length = length; break; + case 3: meta->rules_start = start; meta->rules_length = length; break; + case 4: meta->score_start = start; meta->score_length = length; break; + case 5: meta->glossary_start = start; meta->glossary_length = length; break; + case 6: meta->sequence_start = start; meta->sequence_length = length; break; + case 7: meta->term_tags_start = start; meta->term_tags_length = length; break; + default: break; + } +} + +static int parse_row(const uint8_t* src, uint32_t len, uint32_t row_start, uint32_t row_end, TermRowMeta* out_meta) { + if (row_end <= row_start + 1u || src[row_start] != '[') { return 0; } + out_meta->expression_start = 0u; out_meta->expression_length = 0u; + out_meta->reading_start = 0u; out_meta->reading_length = 0u; + out_meta->definition_tags_start = 0u; out_meta->definition_tags_length = 0u; + out_meta->rules_start = 0u; out_meta->rules_length = 0u; + out_meta->score_start = 0u; out_meta->score_length = 0u; + out_meta->glossary_start = 0u; out_meta->glossary_length = 0u; + out_meta->sequence_start = 0u; out_meta->sequence_length = 0u; + out_meta->term_tags_start = 0u; out_meta->term_tags_length = 0u; + + uint32_t i = row_start + 1u; + uint32_t field_index = 0u; + while (i < row_end) { + i = skip_ws(src, len, i); + if (i >= row_end || src[i] == ']') { break; } + uint32_t value_end = 0u; + if (!parse_value_span(src, len, i, &value_end)) { return 0; } + if (field_index < 8u) { + set_field(out_meta, field_index, i, value_end); + } + ++field_index; + i = skip_ws(src, len, value_end); + if (i < row_end && src[i] == ',') { + ++i; + } + } + return out_meta->expression_length > 0u; +} + +static int is_null_token(const uint8_t* src, uint32_t start, uint32_t length) { + 
return length == 4u && + src[start] == 'n' && + src[start + 1u] == 'u' && + src[start + 2u] == 'l' && + src[start + 3u] == 'l'; +} + +static inline void hash_byte(uint32_t* h1, uint32_t* h2, uint8_t value) { + *h1 = (uint32_t)(((*h1 ^ (uint32_t)value) * 0x01000193u) & 0xffffffffu); + *h2 = (uint32_t)(((*h2 ^ (uint32_t)value) * 0x85ebca6bu) & 0xffffffffu); + *h2 ^= (*h2 >> 13u); +} + +static inline int write_byte_and_hash( + uint8_t* out, + uint32_t out_capacity, + uint32_t* cursor, + uint8_t value, + uint32_t* h1, + uint32_t* h2 +) { + if (*cursor >= out_capacity) { return 0; } + out[*cursor] = value; + *cursor += 1u; + hash_byte(h1, h2, value); + return 1; +} + +static inline int write_bytes_and_hash( + uint8_t* out, + uint32_t out_capacity, + uint32_t* cursor, + const uint8_t* src, + uint32_t length, + uint32_t* h1, + uint32_t* h2 +) { + for (uint32_t i = 0u; i < length; ++i) { + if (!write_byte_and_hash(out, out_capacity, cursor, src[i], h1, h2)) { + return 0; + } + } + return 1; +} + +static int token_equals_literal( + const uint8_t* src, + uint32_t start, + uint32_t length, + const uint8_t* literal, + uint32_t literal_length +) { + if (length != literal_length) { return 0; } + for (uint32_t i = 0u; i < length; ++i) { + if (src[start + i] != literal[i]) { + return 0; + } + } + return 1; +} + +static int glossary_object_try_extract_text_value( + const uint8_t* src, + uint32_t src_len, + uint32_t start, + uint32_t end, + uint32_t* out_text_start, + uint32_t* out_text_length +) { + static const uint8_t KEY_TYPE[] = "\"type\""; + static const uint8_t KEY_TEXT[] = "\"text\""; + static const uint8_t VALUE_TEXT[] = "\"text\""; + + if (start >= end || src[start] != '{') { return 0; } + + uint32_t i = skip_ws(src, src_len, start + 1u); + int has_type_text = 0; + int has_text_value = 0; + uint32_t text_start = 0u; + uint32_t text_length = 0u; + + while (i < end) { + i = skip_ws(src, src_len, i); + if (i >= end) { return 0; } + if (src[i] == '}') { break; } + + uint32_t 
key_end = 0u; + if (!parse_string_span(src, src_len, i, &key_end)) { return 0; } + const uint32_t key_start = i; + const uint32_t key_length = key_end - key_start; + + i = skip_ws(src, src_len, key_end); + if (i >= end || src[i] != ':') { return 0; } + i = skip_ws(src, src_len, i + 1u); + + uint32_t value_end = 0u; + if (!parse_value_span(src, src_len, i, &value_end)) { return 0; } + const uint32_t value_start = i; + const uint32_t value_length = value_end - value_start; + + if (token_equals_literal(src, key_start, key_length, KEY_TYPE, sizeof(KEY_TYPE) - 1u)) { + if (token_equals_literal(src, value_start, value_length, VALUE_TEXT, sizeof(VALUE_TEXT) - 1u)) { + has_type_text = 1; + } + } else if (token_equals_literal(src, key_start, key_length, KEY_TEXT, sizeof(KEY_TEXT) - 1u)) { + if (value_length > 0u && src[value_start] == '"') { + has_text_value = 1; + text_start = value_start; + text_length = value_length; + } + } + + i = skip_ws(src, src_len, value_end); + if (i < end && src[i] == ',') { + i += 1u; + continue; + } + if (i < end && src[i] == '}') { + break; + } + } + + if (!(has_type_text && has_text_value)) { + return 0; + } + + *out_text_start = text_start; + *out_text_length = text_length; + return 1; +} + +static int write_normalized_glossary_value_and_hash( + const uint8_t* src, + uint32_t src_len, + uint32_t value_start, + uint32_t value_end, + uint8_t* out, + uint32_t out_capacity, + uint32_t* cursor, + uint32_t* h1, + uint32_t* h2 +) { + if (value_start >= value_end) { return 0; } + const uint8_t c = src[value_start]; + if (c == '[') { + if (!write_byte_and_hash(out, out_capacity, cursor, '[', h1, h2)) { return 0; } + uint32_t i = value_start + 1u; + int first = 1; + while (i < value_end) { + i = skip_ws(src, src_len, i); + if (i >= value_end) { return 0; } + if (src[i] == ']') { break; } + + uint32_t element_end = 0u; + if (!parse_value_span(src, src_len, i, &element_end)) { return 0; } + if (!first) { + if (!write_byte_and_hash(out, out_capacity, 
cursor, ',', h1, h2)) { return 0; } + } + if (!write_normalized_glossary_value_and_hash(src, src_len, i, element_end, out, out_capacity, cursor, h1, h2)) { + return 0; + } + first = 0; + i = skip_ws(src, src_len, element_end); + if (i < value_end && src[i] == ',') { + i += 1u; + } + } + if (!write_byte_and_hash(out, out_capacity, cursor, ']', h1, h2)) { return 0; } + return 1; + } + + if (c == '{') { + uint32_t text_start = 0u; + uint32_t text_length = 0u; + if (glossary_object_try_extract_text_value(src, src_len, value_start, value_end, &text_start, &text_length)) { + return write_bytes_and_hash(out, out_capacity, cursor, src + text_start, text_length, h1, h2); + } + } + + return write_bytes_and_hash(out, out_capacity, cursor, src + value_start, value_end - value_start, h1, h2); +} + +static int encode_term_content_row( + const uint8_t* src, + const TermRowMeta* row, + uint8_t* out, + uint32_t out_capacity, + uint32_t* cursor, + uint32_t* out_h1, + uint32_t* out_h2 +) { + static const uint8_t PREFIX_RULES[] = "{\"rules\":"; + static const uint8_t PREFIX_DEFINITION_TAGS[] = ",\"definitionTags\":"; + static const uint8_t PREFIX_TERM_TAGS[] = ",\"termTags\":"; + static const uint8_t PREFIX_GLOSSARY[] = ",\"glossary\":"; + static const uint8_t SUFFIX[] = "}"; + static const uint8_t EMPTY_QUOTED[] = "\"\""; + + uint32_t h1 = FNV1A_OFFSET; + uint32_t h2 = MIX_OFFSET; + + if (!write_bytes_and_hash(out, out_capacity, cursor, PREFIX_RULES, sizeof(PREFIX_RULES) - 1u, &h1, &h2)) { return 0; } + if (row->rules_length > 0u && !is_null_token(src, row->rules_start, row->rules_length)) { + if (!write_bytes_and_hash(out, out_capacity, cursor, src + row->rules_start, row->rules_length, &h1, &h2)) { return 0; } + } else { + if (!write_bytes_and_hash(out, out_capacity, cursor, EMPTY_QUOTED, sizeof(EMPTY_QUOTED) - 1u, &h1, &h2)) { return 0; } + } + + if (!write_bytes_and_hash(out, out_capacity, cursor, PREFIX_DEFINITION_TAGS, sizeof(PREFIX_DEFINITION_TAGS) - 1u, &h1, &h2)) { return 0; 
} + if (row->definition_tags_length > 0u && !is_null_token(src, row->definition_tags_start, row->definition_tags_length)) { + if (!write_bytes_and_hash(out, out_capacity, cursor, src + row->definition_tags_start, row->definition_tags_length, &h1, &h2)) { return 0; } + } else { + if (!write_bytes_and_hash(out, out_capacity, cursor, EMPTY_QUOTED, sizeof(EMPTY_QUOTED) - 1u, &h1, &h2)) { return 0; } + } + + if (!write_bytes_and_hash(out, out_capacity, cursor, PREFIX_TERM_TAGS, sizeof(PREFIX_TERM_TAGS) - 1u, &h1, &h2)) { return 0; } + if (row->term_tags_length > 0u && !is_null_token(src, row->term_tags_start, row->term_tags_length)) { + if (!write_bytes_and_hash(out, out_capacity, cursor, src + row->term_tags_start, row->term_tags_length, &h1, &h2)) { return 0; } + } else { + if (!write_bytes_and_hash(out, out_capacity, cursor, EMPTY_QUOTED, sizeof(EMPTY_QUOTED) - 1u, &h1, &h2)) { return 0; } + } + + if (!write_bytes_and_hash(out, out_capacity, cursor, PREFIX_GLOSSARY, sizeof(PREFIX_GLOSSARY) - 1u, &h1, &h2)) { return 0; } + if (row->glossary_length > 0u) { + if (!write_normalized_glossary_value_and_hash( + src, + row->glossary_start + row->glossary_length, + row->glossary_start, + row->glossary_start + row->glossary_length, + out, + out_capacity, + cursor, + &h1, + &h2 + )) { return 0; } + } else { + static const uint8_t EMPTY_ARRAY[] = "[]"; + if (!write_bytes_and_hash(out, out_capacity, cursor, EMPTY_ARRAY, sizeof(EMPTY_ARRAY) - 1u, &h1, &h2)) { return 0; } + } + + if (!write_bytes_and_hash(out, out_capacity, cursor, SUFFIX, sizeof(SUFFIX) - 1u, &h1, &h2)) { return 0; } + + if ((h1 | h2) == 0u) { + h1 = 1u; + } + *out_h1 = h1; + *out_h2 = h2; + return 1; +} + +__attribute__((visibility("default"))) +int32_t parse_term_bank(uint32_t json_ptr, uint32_t json_len, uint32_t out_ptr, uint32_t out_capacity) { + if (json_ptr == 0u || json_len == 0u || out_ptr == 0u || out_capacity == 0u) { + return -1; + } + const uint8_t* src = (const uint8_t*)(uintptr_t)json_ptr; + 
TermRowMeta* rows = (TermRowMeta*)(uintptr_t)out_ptr; + + uint32_t i = skip_ws(src, json_len, 0u); + if (i >= json_len || src[i] != '[') { return -1; } + ++i; + + uint32_t row_count = 0u; + while (i < json_len) { + i = skip_ws(src, json_len, i); + if (i >= json_len) { break; } + if (src[i] == ']') { + return (int32_t)row_count; + } + uint32_t row_end = 0u; + if (!parse_composite_span(src, json_len, i, &row_end)) { + return -1; + } + if (row_count >= out_capacity) { + return -2; + } + if (!parse_row(src, json_len, i, row_end, &rows[row_count])) { + return -1; + } + ++row_count; + i = skip_ws(src, json_len, row_end); + if (i < json_len && src[i] == ',') { + ++i; + } + } + return -1; +} + +__attribute__((visibility("default"))) +int32_t encode_term_content( + uint32_t json_ptr, + uint32_t metas_ptr, + uint32_t row_count, + uint32_t out_ptr, + uint32_t out_capacity, + uint32_t row_meta_ptr +) { + if (json_ptr == 0u || metas_ptr == 0u || out_ptr == 0u || row_meta_ptr == 0u) { + return -1; + } + const uint8_t* src = (const uint8_t*)(uintptr_t)json_ptr; + const TermRowMeta* rows = (const TermRowMeta*)(uintptr_t)metas_ptr; + uint8_t* out = (uint8_t*)(uintptr_t)out_ptr; + uint32_t* row_meta = (uint32_t*)(uintptr_t)row_meta_ptr; + uint32_t cursor = 0u; + + for (uint32_t i = 0u; i < row_count; ++i) { + const uint32_t start = cursor; + uint32_t h1 = 0u; + uint32_t h2 = 0u; + if (!encode_term_content_row(src, &rows[i], out, out_capacity, &cursor, &h1, &h2)) { + return -2; + } + const uint32_t o = i * 4u; + row_meta[o + 0u] = start; + row_meta[o + 1u] = cursor - start; + row_meta[o + 2u] = h1; + row_meta[o + 3u] = h2; + } + return (int32_t)cursor; +} diff --git a/ext/js/dictionary/wasm/term-record-encoder.c b/ext/js/dictionary/wasm/term-record-encoder.c new file mode 100644 index 0000000000..358935922d --- /dev/null +++ b/ext/js/dictionary/wasm/term-record-encoder.c @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2026 Yomitan authors + * + * This program is free software: you can 
redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include + +#define RECORD_HEADER_BYTES 20u +#define U32_NULL 0xffffffffu +#define U16_NULL 0xffffu +#define ENTRY_CONTENT_LENGTH_EXTENDED_U16 0xfffeu +#define WASM_PAGE_SIZE 65536u +#define ENTRY_CONTENT_DICT_NAME_CODE_CUSTOM 0xffu +#define ENTRY_CONTENT_DICT_NAME_FLAG_READING_EQUALS_EXPRESSION 0x8000u +#define ENTRY_CONTENT_DICT_NAME_FLAGS_MASK 0x8000u + +extern unsigned char __heap_base; + +static uint32_t heap_ptr = 0u; + +struct RecordMeta { + uint32_t expression_off; + uint32_t expression_len; + uint32_t reading_off; + uint32_t reading_len; + int32_t entry_content_offset; + int32_t entry_content_length; + uint32_t dict_name_meta; + uint32_t dict_name_off; + int32_t score; + int32_t sequence; +}; + +static uint32_t align8(uint32_t value) { + return (value + 7u) & ~7u; +} + +static int ensure_memory(uint32_t required_bytes) { + uint32_t current_pages = __builtin_wasm_memory_size(0); + uint32_t current_bytes = current_pages * WASM_PAGE_SIZE; + if (required_bytes <= current_bytes) { + return 1; + } + uint32_t missing = required_bytes - current_bytes; + uint32_t grow_pages = (missing + (WASM_PAGE_SIZE - 1u)) / WASM_PAGE_SIZE; + int32_t rc = __builtin_wasm_memory_grow(0, grow_pages); + return rc >= 0; +} + +__attribute__((visibility("default"))) +void wasm_reset_heap(void) { + heap_ptr = (uint32_t)(uintptr_t)&__heap_base; +} + +__attribute__((visibility("default"))) +uint32_t 
wasm_alloc(uint32_t size) { + if (heap_ptr == 0u) { + wasm_reset_heap(); + } + uint32_t aligned_size = align8(size); + uint32_t start = align8(heap_ptr); + uint32_t end = start + aligned_size; + if (!ensure_memory(end)) { + return 0u; + } + heap_ptr = end; + return start; +} + +static inline void write_u16(uint8_t* out, uint32_t* cursor, uint32_t value) { + uint32_t c = *cursor; + out[c + 0u] = (uint8_t)(value & 0xffu); + out[c + 1u] = (uint8_t)((value >> 8u) & 0xffu); + *cursor = c + 2u; +} + +static inline void write_u32(uint8_t* out, uint32_t* cursor, uint32_t value) { + uint32_t c = *cursor; + out[c + 0u] = (uint8_t)(value & 0xffu); + out[c + 1u] = (uint8_t)((value >> 8u) & 0xffu); + out[c + 2u] = (uint8_t)((value >> 16u) & 0xffu); + out[c + 3u] = (uint8_t)((value >> 24u) & 0xffu); + *cursor = c + 4u; +} + +static inline void write_i32(uint8_t* out, uint32_t* cursor, int32_t value) { + write_u32(out, cursor, (uint32_t)value); +} + +static inline void copy_bytes(uint8_t* out, uint32_t* cursor, const uint8_t* src, uint32_t len) { + uint32_t c = *cursor; + for (uint32_t i = 0u; i < len; ++i) { + out[c + i] = src[i]; + } + *cursor = c + len; +} + +__attribute__((visibility("default"))) +uint32_t calc_encoded_size(uint32_t record_count, uint32_t metas_ptr) { + const struct RecordMeta* metas = (const struct RecordMeta*)(uintptr_t)metas_ptr; + uint32_t total = 0u; + for (uint32_t i = 0u; i < record_count; ++i) { + const struct RecordMeta* m = &metas[i]; + uint32_t variable = + m->expression_len + + ((m->dict_name_meta & ENTRY_CONTENT_DICT_NAME_FLAG_READING_EQUALS_EXPRESSION) != 0u ? 
0u : m->reading_len); + if ((m->dict_name_meta & 0xffu) == ENTRY_CONTENT_DICT_NAME_CODE_CUSTOM) { + variable += ((m->dict_name_meta & ~ENTRY_CONTENT_DICT_NAME_FLAGS_MASK) >> 8u); + } + total += RECORD_HEADER_BYTES + variable; + if ((uint32_t)m->entry_content_length > 0xfffdu) { + total += 4u; + } + if ((m->dict_name_meta & ~ENTRY_CONTENT_DICT_NAME_FLAGS_MASK) > 0x7fffu) { + total += 4u; + } + } + return total; +} + +__attribute__((visibility("default"))) +uint32_t encode_records(uint32_t record_count, uint32_t metas_ptr, uint32_t strings_ptr, uint32_t out_ptr) { + const struct RecordMeta* metas = (const struct RecordMeta*)(uintptr_t)metas_ptr; + const uint8_t* strings = (const uint8_t*)(uintptr_t)strings_ptr; + uint8_t* out = (uint8_t*)(uintptr_t)out_ptr; + uint32_t cursor = 0u; + + for (uint32_t i = 0u; i < record_count; ++i) { + const struct RecordMeta* m = &metas[i]; + write_u16(out, &cursor, m->expression_len > U16_NULL ? U16_NULL : m->expression_len); + write_u16(out, &cursor, m->reading_len > U16_NULL ? U16_NULL : m->reading_len); + write_u32(out, &cursor, m->entry_content_offset >= 0 ? 
(uint32_t)m->entry_content_offset : U32_NULL); + if (m->entry_content_length < 0) { + write_u16(out, &cursor, U16_NULL); + } else if ((uint32_t)m->entry_content_length <= 0xfffdu) { + write_u16(out, &cursor, (uint32_t)m->entry_content_length); + } else { + write_u16(out, &cursor, ENTRY_CONTENT_LENGTH_EXTENDED_U16); + write_u32(out, &cursor, (uint32_t)m->entry_content_length); + } + if ((m->dict_name_meta & ~ENTRY_CONTENT_DICT_NAME_FLAGS_MASK) <= 0x7fffu) { + write_u16(out, &cursor, m->dict_name_meta); + } else { + write_u16(out, &cursor, U16_NULL); + write_u32(out, &cursor, m->dict_name_meta); + } + write_i32(out, &cursor, m->score); + write_i32(out, &cursor, m->sequence); + + copy_bytes(out, &cursor, strings + m->expression_off, m->expression_len); + if ((m->dict_name_meta & ENTRY_CONTENT_DICT_NAME_FLAG_READING_EQUALS_EXPRESSION) == 0u) { + copy_bytes(out, &cursor, strings + m->reading_off, m->reading_len); + } + if ((m->dict_name_meta & 0xffu) == ENTRY_CONTENT_DICT_NAME_CODE_CUSTOM) { + copy_bytes(out, &cursor, strings + m->dict_name_off, (m->dict_name_meta & ~ENTRY_CONTENT_DICT_NAME_FLAGS_MASK) >> 8u); + } + } + return cursor; +} diff --git a/ext/js/dictionary/zstd-term-content.js b/ext/js/dictionary/zstd-term-content.js new file mode 100644 index 0000000000..1f0fe0f0de --- /dev/null +++ b/ext/js/dictionary/zstd-term-content.js @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2023-2025 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +import {createCCtx, createDCtx, compress, compressUsingDict, decompress, decompressUsingDict, init} from '../../lib/zstd-wasm.js'; +import {log} from '../core/log.js'; + +let isInitialized = false; +/** @type {Promise|null} */ +let initializePromise = null; +/** @type {Uint8Array|null} */ +let jmdictDict = null; +/** @type {number|null} */ +let cctx = null; +/** @type {number|null} */ +let dctx = null; + +/** + * @returns {Promise} + */ +export async function initializeTermContentZstd() { + if (isInitialized) { return; } + if (initializePromise !== null) { + await initializePromise; + return; + } + initializePromise = (async () => { + await init('/lib/zstd.wasm'); + cctx = Number(createCCtx()); + dctx = Number(createDCtx()); + const response = await fetch('/lib/zstd-dicts/jmdict.zdict'); + if (!response.ok) { + throw new Error(`Failed to load zstd dictionary: ${response.status}`); + } + jmdictDict = new Uint8Array(await response.arrayBuffer()); + isInitialized = true; + })(); + + try { + await initializePromise; + } catch (e) { + initializePromise = null; + throw e; + } +} + +/** + * @param {string} dictionaryTitle + * @returns {string|null} + */ +export function resolveTermContentZstdDictName(dictionaryTitle) { + const normalized = dictionaryTitle.toLowerCase().replace(/[^a-z0-9]/g, ''); + if (normalized.includes('jmdict') || normalized.includes('jitendex')) { + return 'jmdict'; + } + return null; +} + +/** + * @param {Uint8Array} content + * @param {string|null} dictName + * @returns {Uint8Array} + * @throws {Error} + */ +export function compressTermContentZstd(content, dictName) { + if (!isInitialized || cctx === null) { + throw new Error('Term content zstd not initialized'); + } + if (dictName === 'jmdict' && jmdictDict !== null) { + return compressUsingDict(cctx, content, jmdictDict, 1); + } + return compress(content, 1); +} + +/** + * 
@param {Uint8Array} content + * @param {string|null} dictName + * @returns {Uint8Array} + * @throws {Error} + */ +export function decompressTermContentZstd(content, dictName) { + if (!isInitialized || dctx === null) { + throw new Error('Term content zstd not initialized'); + } + if (dictName === 'jmdict' && jmdictDict !== null) { + return decompressUsingDict(dctx, content, jmdictDict); + } + return decompress(content); +} + +/** + * @param {unknown} error + */ +export function logTermContentZstdError(error) { + log.error(error); +} diff --git a/ext/js/pages/settings/backup-controller.js b/ext/js/pages/settings/backup-controller.js index 11317243fe..54fc069607 100644 --- a/ext/js/pages/settings/backup-controller.js +++ b/ext/js/pages/settings/backup-controller.js @@ -16,7 +16,6 @@ * along with this program. If not, see . */ -import {Dexie} from '../../../lib/dexie.js'; import {ThemeController} from '../../app/theme-controller.js'; import {parseJson} from '../../core/json.js'; import {log} from '../../core/log.js'; @@ -53,8 +52,6 @@ export class BackupController { /** @type {?OptionsUtil} */ this._optionsUtil = null; - /** @type {string} */ - this._dictionariesDatabaseName = 'dict'; /** @type {?import('core').TokenObject} */ this._settingsExportDatabaseToken = null; @@ -567,40 +564,30 @@ export class BackupController { } /** - * @param {{totalRows: number, completedRows: number, done: boolean}} details + * @param {string} message + * @param {string} [color] + * @param {boolean} [hide] */ - _databaseExportProgressCallback({totalRows, completedRows, done}) { - log.log(`Progress: ${completedRows} of ${totalRows} rows completed`); + _setDatabaseExportImportStatus(message, color = '#4169e1', hide = false) { /** @type {HTMLElement} */ const messageSettingsContainer = querySelectorNotNull(document, '#db-ops-progress-report-container'); messageSettingsContainer.style.display = 'block'; /** @type {HTMLElement} */ const messageContainer = querySelectorNotNull(document, 
'#db-ops-progress-report'); - messageContainer.style.display = 'block'; - messageContainer.textContent = `Export Progress: ${completedRows} of ${totalRows} rows completed`; - - if (done) { - log.log('Done exporting.'); + if (hide) { messageContainer.style.display = 'none'; + return; } + messageContainer.style.display = 'block'; + messageContainer.style.color = color; + messageContainer.textContent = message; } /** - * @param {string} databaseName - * @returns {Promise} + * @returns {Promise} */ - async _exportDatabase(databaseName) { - // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment - const DexieConstructor = /** @type {import('dexie').DexieConstructor} */ (/** @type {unknown} */ (Dexie)); - const db = new DexieConstructor(databaseName); - await db.open(); - /** @type {unknown} */ - // @ts-expect-error - The export function is declared as an extension which has no type information. - const blob = await db.export({ - progressCallback: this._databaseExportProgressCallback.bind(this), - }); - db.close(); - return /** @type {Blob} */ (blob); + async _exportDatabase() { + return await this._settingsController.application.api.exportDictionaryDatabase(); } /** */ @@ -621,12 +608,15 @@ export class BackupController { /** @type {import('core').TokenObject} */ const token = {}; this._settingsExportDatabaseToken = token; - const fileName = `yomitan-dictionaries-${this._getSettingsExportDateString(date, '-', '-', '-', 6)}.json`; - const data = await this._exportDatabase(this._dictionariesDatabaseName); - const blob = new Blob([data], {type: 'application/json'}); + this._setDatabaseExportImportStatus('Exporting dictionary collection...'); + const fileName = `yomitan-dictionaries-${this._getSettingsExportDateString(date, '-', '-', '-', 6)}.sqlite3`; + const data = await this._exportDatabase(); + const blob = new Blob([data], {type: 'application/octet-stream'}); this._saveBlob(blob, fileName); + this._setDatabaseExportImportStatus('Done exporting dictionary 
collection.', '#006633'); } catch (error) { log.log(error); + this._setDatabaseExportImportStatus('', '#4169e1', true); this._databaseExportImportErrorMessage('Errors encountered while exporting. Please try again. Restart the browser if it continues to fail.'); } finally { pageExitPrevention.end(); @@ -634,39 +624,12 @@ export class BackupController { } } - // Importing Dictionaries Database - /** - * @param {{totalRows: number, completedRows: number, done: boolean}} details - */ - _databaseImportProgressCallback({totalRows, completedRows, done}) { - log.log(`Progress: ${completedRows} of ${totalRows} rows completed`); - /** @type {HTMLElement} */ - const messageSettingsContainer = querySelectorNotNull(document, '#db-ops-progress-report-container'); - messageSettingsContainer.style.display = 'block'; - /** @type {HTMLElement} */ - const messageContainer = querySelectorNotNull(document, '#db-ops-progress-report'); - messageContainer.style.display = 'block'; - messageContainer.style.color = '#4169e1'; - messageContainer.textContent = `Import Progress: ${completedRows} of ${totalRows} rows completed`; - - if (done) { - log.log('Done importing.'); - messageContainer.style.color = '#006633'; - messageContainer.textContent = 'Done importing. You will need to re-enable the dictionaries and refresh afterward. If you run into issues, please restart the browser. 
If it continues to fail, reinstall Yomitan and import dictionaries one-by-one.'; - } - } - - /** - * @param {string} _databaseName * @param {File} file */ - async _importDatabase(_databaseName, file) { - await this._settingsController.application.api.purgeDatabase(); - await Dexie.import(file, { - progressCallback: this._databaseImportProgressCallback.bind(this), - }); - void this._settingsController.application.api.triggerDatabaseUpdated('dictionary', 'import'); + async _importDatabase(file) { + const content = await this._readFileArrayBuffer(file); + await this._settingsController.application.api.importDictionaryDatabase(content); this._settingsController.application.triggerStorageChanged(); } @@ -702,12 +665,15 @@ export class BackupController { /** @type {import('core').TokenObject} */ const token = {}; this._settingsExportDatabaseToken = token; - await this._importDatabase(this._dictionariesDatabaseName, file); + this._setDatabaseExportImportStatus('Importing dictionary collection...'); + await this._importDatabase(file); + this._setDatabaseExportImportStatus( + 'Done importing. You will need to re-enable the dictionaries and refresh afterward. If you run into issues, please restart the browser. If it continues to fail, reinstall Yomitan and import dictionaries one-by-one.', + '#006633', + ); } catch (error) { log.log(error); - /** @type {HTMLElement} */ - const messageContainer = querySelectorNotNull(document, '#db-ops-progress-report'); - messageContainer.style.color = 'red'; + this._setDatabaseExportImportStatus('', '#4169e1', true); this._databaseExportImportErrorMessage('Encountered errors when importing. Please restart the browser and try again. If it continues to fail, reinstall Yomitan and import dictionaries one-by-one.'); } finally { pageExitPrevention.end(); diff --git a/ext/legal-npm.html b/ext/legal-npm.html index 651ff3829c..b6f83e7cbe 100644 --- a/ext/legal-npm.html +++ b/ext/legal-npm.html @@ -60,6 +60,6 @@ } -
nameinstalled versionlicense typelink
@resvg/resvg-wasm2.6.2MPL-2.0git+ssh://git@github.com/yisibl/resvg-js.git
@zip.js/zip.js2.7.54BSD-3-Clausegit+https://github.com/gildas-lormeau/zip.js.git
dexie4.0.11Apache-2.0git+https://github.com/dexie/Dexie.js.git
dexie-export-import4.1.4Apache-2.0git+https://github.com/dexie/Dexie.js.git
hangul-js0.2.6MITgit://github.com/e-/Hangul.js.git
kanji-processor1.0.2n/ahttps://registry.npmjs.org/kanji-processor/-/kanji-processor-1.0.2.tgz
parse57.2.1MITgit://github.com/inikulin/parse5.git
yomitan-handlebars1.0.0MITn/a
linkedom0.18.10ISCgit+https://github.com/WebReflection/linkedom.git
+
nameinstalled versionlicense typelink
@bokuweb/zstd-wasm0.0.27MITn/a
@resvg/resvg-wasm2.6.2MPL-2.0n/a
@sqlite.org/sqlite-wasm3.51.2-build7Apache-2.0n/a
@zip.js/zip.js2.7.54BSD-3-Clausen/a
dexie4.0.11Apache-2.0n/a
dexie-export-import4.1.4Apache-2.0n/a
hangul-js0.2.6MITn/a
kanji-processor1.0.2n/an/a
linkedom0.18.10ISCn/a
parse57.2.1MITn/a
yomitan-handlebars1.0.0MITn/a
diff --git a/ext/settings.html b/ext/settings.html index 0d60172467..d0b756381f 100644 --- a/ext/settings.html +++ b/ext/settings.html @@ -2347,7 +2347,7 @@

Yomitan Settings

- +
diff --git a/package-lock.json b/package-lock.json index c044d39ade..649a41c83f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,9 @@ "version": "0.0.0", "license": "GPL-3.0-or-later", "dependencies": { + "@bokuweb/zstd-wasm": "^0.0.27", "@resvg/resvg-wasm": "^2.6.2", + "@sqlite.org/sqlite-wasm": "^3.51.2-build6", "@zip.js/zip.js": "^2.7.54", "dexie": "^4.0.10", "dexie-export-import": "^4.1.4", @@ -1695,6 +1697,11 @@ "node": ">=18" } }, + "node_modules/@bokuweb/zstd-wasm": { + "version": "0.0.27", + "resolved": "https://registry.npmjs.org/@bokuweb/zstd-wasm/-/zstd-wasm-0.0.27.tgz", + "integrity": "sha512-GDm2uOTK3ESjnYmSeLQifJnBsRCWajKLvN32D2ZcQaaCIJI/Hse9s74f7APXjHit95S10UImsRGkTsbwHmrtmg==" + }, "node_modules/@codspeed/core": { "version": "4.0.1", "resolved": "https://registry.npmjs.org/@codspeed/core/-/core-4.0.1.tgz", @@ -3015,6 +3022,14 @@ "url": "https://github.com/sindresorhus/is?sponsor=1" } }, + "node_modules/@sqlite.org/sqlite-wasm": { + "version": "3.51.2-build7", + "resolved": "https://registry.npmjs.org/@sqlite.org/sqlite-wasm/-/sqlite-wasm-3.51.2-build7.tgz", + "integrity": "sha512-11GK8eeFTP4pLCDdFZTwxeQ57bjghDo7JLNw1li+jmhXDr01lUcee2A+Rn6BJhPBU29oqf3OPBXfZtWDzoNCww==", + "engines": { + "node": ">=22" + } + }, "node_modules/@stylistic/eslint-plugin": { "version": "2.12.1", "dev": true, diff --git a/package.json b/package.json index 0f1d1479fb..7abf423421 100644 --- a/package.json +++ b/package.json @@ -112,15 +112,17 @@ "vitest": "^3.0.9" }, "dependencies": { + "@bokuweb/zstd-wasm": "^0.0.27", "@resvg/resvg-wasm": "^2.6.2", + "@sqlite.org/sqlite-wasm": "^3.51.2-build6", "@zip.js/zip.js": "^2.7.54", "dexie": "^4.0.10", "dexie-export-import": "^4.1.4", "hangul-js": "^0.2.6", "kanji-processor": "^1.0.2", + "linkedom": "^0.18.10", "parse5": "^7.2.1", - "yomitan-handlebars": "git+https://github.com/yomidevs/yomitan-handlebars.git#12aff5e3550954d7d3a98a5917ff7d579f3cce25", - "linkedom": "^0.18.10" + "yomitan-handlebars": 
"git+https://github.com/yomidevs/yomitan-handlebars.git#12aff5e3550954d7d3a98a5917ff7d579f3cce25" }, "lint-staged": { "*.md": "prettier --write" diff --git a/shell.nix b/shell.nix index 6e4952e53e..286ec8603f 100644 --- a/shell.nix +++ b/shell.nix @@ -1,4 +1,7 @@ { pkgs ? import { } }: pkgs.mkShell { - nativeBuildInputs = [ pkgs.nodejs_22 ]; + nativeBuildInputs = [ + pkgs.nodejs_22 + pkgs.clang + ]; } diff --git a/test/core.test.js b/test/core.test.js index d9ec224b44..9a23bfdbf5 100644 --- a/test/core.test.js +++ b/test/core.test.js @@ -16,10 +16,15 @@ * along with this program. If not, see . */ +import {webcrypto} from 'node:crypto'; import {describe, expect, test} from 'vitest'; import {DynamicProperty} from '../ext/js/core/dynamic-property.js'; import {deepEqual} from '../ext/js/core/utilities.js'; +if (typeof globalThis.crypto === 'undefined') { + globalThis.crypto = /** @type {Crypto} */ (/** @type {unknown} */ (webcrypto)); +} + describe('DynamicProperty', () => { /** @type {import('test/core').DynamicPropertyTestData} */ const data = [ diff --git a/test/data/database-test-cases.json b/test/data/database-test-cases.json index e88a359e8d..00abc04d81 100644 --- a/test/data/database-test-cases.json +++ b/test/data/database-test-cases.json @@ -17,7 +17,7 @@ "freq": 6 }, "media": { - "total": 6 + "total": 2 }, "tagMeta": { "total": 15 @@ -41,7 +41,7 @@ "terms": 34, "termMeta": 40, "tagMeta": 15, - "media": 6 + "media": 2 } ], "total": { @@ -50,7 +50,7 @@ "terms": 34, "termMeta": 40, "tagMeta": 15, - "media": 6 + "media": 2 } }, "tests": { diff --git a/test/data/translator-test-results-note-data1.json b/test/data/translator-test-results-note-data1.json index 2ddc67920f..258309cf86 100644 --- a/test/data/translator-test-results-note-data1.json +++ b/test/data/translator-test-results-note-data1.json @@ -7454,10 +7454,8 @@ { "type": "image", "path": "image.gif", - "width": 100, - "height": 100, - "preferredWidth": 350, - "preferredHeight": 350, + "width": 350, + 
"height": 350, "description": "gazou definition 2", "pixelated": true } @@ -13704,10 +13702,8 @@ { "type": "image", "path": "image.gif", - "width": 100, - "height": 100, - "preferredWidth": 350, - "preferredHeight": 350, + "width": 350, + "height": 350, "description": "gazou definition 2", "pixelated": true } diff --git a/test/data/translator-test-results.json b/test/data/translator-test-results.json index d23594c78f..f07d485d15 100644 --- a/test/data/translator-test-results.json +++ b/test/data/translator-test-results.json @@ -3974,10 +3974,8 @@ { "type": "image", "path": "image.gif", - "width": 100, - "height": 100, - "preferredWidth": 350, - "preferredHeight": 350, + "width": 350, + "height": 350, "description": "gazou definition 2", "pixelated": true } @@ -7153,10 +7151,8 @@ { "type": "image", "path": "image.gif", - "width": 100, - "height": 100, - "preferredWidth": 350, - "preferredHeight": 350, + "width": 350, + "height": 350, "description": "gazou definition 2", "pixelated": true } diff --git a/test/database.test.js b/test/database.test.js index bcb5181cd1..ed43a70a8e 100644 --- a/test/database.test.js +++ b/test/database.test.js @@ -17,14 +17,22 @@ */ import {IDBFactory, IDBKeyRange} from 'fake-indexeddb'; -import {readFileSync} from 'node:fs'; +import {readdirSync, readFileSync} from 'node:fs'; import {fileURLToPath} from 'node:url'; import {join, dirname as pathDirname} from 'path'; +import {BlobWriter, TextReader, ZipWriter} from '@zip.js/zip.js'; import {beforeEach, describe, test, vi} from 'vitest'; import {createDictionaryArchiveData, getDictionaryArchiveIndex} from '../dev/dictionary-archive-util.js'; import {parseJson} from '../dev/json.js'; import {DictionaryDatabase} from '../ext/js/dictionary/dictionary-database.js'; import {DictionaryImporter} from '../ext/js/dictionary/dictionary-importer.js'; +import {encodeRawTermContentSharedGlossaryBinary} from '../ext/js/dictionary/raw-term-content.js'; +import * as sqliteWasm from 
'../ext/js/dictionary/sqlite-wasm.js'; +import {TermContentOpfsStore} from '../ext/js/dictionary/term-content-opfs-store.js'; +import {TermRecordOpfsStore} from '../ext/js/dictionary/term-record-opfs-store.js'; +import {DictionaryWorkerHandler} from '../ext/js/dictionary/dictionary-worker-handler.js'; +import {compress as zstdCompress, init as zstdInit} from '../ext/lib/zstd-wasm.js'; +import {chrome, fetch} from './mocks/common.js'; import {DictionaryImporterMediaLoader} from './mocks/dictionary-importer-media-loader.js'; import {setupStubs} from './utilities/database.js'; @@ -32,6 +40,162 @@ const dirname = pathDirname(fileURLToPath(import.meta.url)); setupStubs(); vi.stubGlobal('IDBKeyRange', IDBKeyRange); +vi.stubGlobal('fetch', fetch); +vi.stubGlobal('chrome', chrome); + +/** + * @returns {{ + * kind: 'directory', + * getDirectoryHandle: (name: string, options?: {create?: boolean}) => Promise, + * getFileHandle: (name: string, options?: {create?: boolean}) => Promise, + * removeEntry: (name: string) => Promise, + * entries: () => AsyncGenerator<[string, unknown], void, unknown> + * }} + */ +function createInMemoryOpfsDirectoryHandle() { + /** @type {Map>} */ + const directories = new Map(); + /** @type {Map} */ + const files = new Map(); + + /** + * @param {string} fileName + * @returns {{ + * kind: 'file', + * getFile: () => Promise<{size: number, arrayBuffer: () => Promise}>, + * createWritable: (options?: {keepExistingData?: boolean}) => Promise<{ + * seek: (offset: number) => Promise, + * truncate: (size: number) => Promise, + * write: (chunk: Uint8Array|ArrayBuffer) => Promise, + * close: () => Promise + * }> + * }} + */ + const createFileHandle = (fileName) => ({ + kind: /** @type {'file'} */ ('file'), + async getFile() { + const bytes = files.get(fileName) ?? 
new Uint8Array(0); + return { + size: bytes.byteLength, + arrayBuffer: async () => bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength), + }; + }, + async createWritable(options = {}) { + let bytes = options.keepExistingData === true ? Uint8Array.from(files.get(fileName) ?? new Uint8Array(0)) : new Uint8Array(0); + let position = 0; + + /** + * @param {number} requiredLength + */ + const ensureCapacity = (requiredLength) => { + if (requiredLength <= bytes.byteLength) { + return; + } + const next = new Uint8Array(requiredLength); + next.set(bytes, 0); + bytes = next; + }; + + return { + seek: async (offset) => { + position = Math.max(0, Math.trunc(offset)); + }, + truncate: async (size) => { + const nextSize = Math.max(0, Math.trunc(size)); + if (nextSize < bytes.byteLength) { + bytes = Uint8Array.from(bytes.subarray(0, nextSize)); + } else if (nextSize > bytes.byteLength) { + const next = new Uint8Array(nextSize); + next.set(bytes, 0); + bytes = next; + } + if (position > nextSize) { + position = nextSize; + } + }, + write: async (chunk) => { + const source = chunk instanceof Uint8Array ? 
chunk : new Uint8Array(chunk); + const end = position + source.byteLength; + ensureCapacity(end); + bytes.set(source, position); + position = end; + }, + close: async () => { + files.set(fileName, bytes); + }, + }; + }, + }); + + return { + kind: /** @type {'directory'} */ ('directory'), + async getDirectoryHandle(name, options = {}) { + const existing = directories.get(name); + if (typeof existing !== 'undefined') { + return existing; + } + if (options.create !== true) { + throw new Error(`NotFoundError: directory '${name}'`); + } + const created = createInMemoryOpfsDirectoryHandle(); + directories.set(name, created); + return created; + }, + async getFileHandle(name, options = {}) { + if (!files.has(name)) { + if (options.create !== true) { + throw new Error(`NotFoundError: file '${name}'`); + } + files.set(name, new Uint8Array(0)); + } + return createFileHandle(name); + }, + async removeEntry(name) { + if (files.delete(name) || directories.delete(name)) { + return; + } + throw new Error(`NotFoundError: entry '${name}'`); + }, + async *entries() { + for (const [name, directoryHandle] of directories) { + yield [name, directoryHandle]; + } + for (const [name] of files) { + yield [name, createFileHandle(name)]; + } + }, + }; +} + +/** + * @param {unknown} rootDirectoryHandle + * @returns {() => void} + */ +function installInMemoryOpfsNavigator(rootDirectoryHandle) { + const descriptor = Object.getOwnPropertyDescriptor(globalThis, 'navigator'); + const previousNavigator = /** @type {unknown} */ (globalThis.navigator); + /** @type {Record} */ + let navigatorBase = {}; + if (typeof previousNavigator === 'object' && previousNavigator !== null) { + navigatorBase = /** @type {Record} */ (previousNavigator); + } + const nextNavigator = {...navigatorBase}; + nextNavigator.storage = { + getDirectory: async () => rootDirectoryHandle, + }; + Object.defineProperty(globalThis, 'navigator', { + configurable: true, + writable: true, + value: nextNavigator, + }); + return () => { + 
if (typeof descriptor !== 'undefined') { + Object.defineProperty(globalThis, 'navigator', descriptor); + return; + } + Reflect.deleteProperty(globalThis, 'navigator'); + }; +} /** * @param {string} dictionary @@ -44,15 +208,255 @@ async function createTestDictionaryArchiveData(dictionary, dictionaryName) { } /** - * @param {import('vitest').ExpectStatic} expect + * @param {Uint8Array[]} chunks + * @returns {Uint8Array} + */ +function concatUint8Arrays(chunks) { + const totalLength = chunks.reduce((sum, chunk) => (sum + chunk.byteLength), 0); + const result = new Uint8Array(totalLength); + let offset = 0; + for (const chunk of chunks) { + result.set(chunk, offset); + offset += chunk.byteLength; + } + return result; +} + +/** + * @param {DictionaryDatabase} dictionaryDatabase + * @returns {import('@sqlite.org/sqlite-wasm').Database} + * @throws {Error} + */ +function getRequiredDb(dictionaryDatabase) { + const requireDb = Reflect.get(dictionaryDatabase, '_requireDb'); + if (typeof requireDb !== 'function') { + throw new Error('Expected _requireDb method'); + } + return requireDb.call(dictionaryDatabase); +} + +/** + * @param {TermRecordOpfsStore} termRecordStore + * @returns {Map, reading: Map, expressionReverse: Map, readingReverse: Map, pair: Map, sequence: Map}>} + * @throws {Error} + */ +function getIndexByDictionary(termRecordStore) { + const indexByDictionary = Reflect.get(termRecordStore, '_indexByDictionary'); + if (!(indexByDictionary instanceof Map)) { + throw new Error('Expected _indexByDictionary map'); + } + return indexByDictionary; +} + +let zstdInitialized = false; + +/** + * @returns {Promise} + */ +async function ensureZstdInitialized() { + if (zstdInitialized) { return; } + const wasmPath = join(dirname, '..', 'ext', 'lib', 'zstd.wasm'); + await zstdInit(new Uint8Array(readFileSync(wasmPath))); + zstdInitialized = true; +} + +/** + * @param {DictionaryImporter} dictionaryImporter + * @param {import('dictionary-data').IndexVersion} version + * 
@param {string} dictionaryTitle + * @param {import('dictionary-data').TermV1Array|import('dictionary-data').TermV3Array} rawEntries + * @param {'legacy'|'raw-v3'|'raw-v4'} [termContentMode] + * @returns {{payload: Uint8Array, sharedGlossaryBytes: Uint8Array|null}} + * @throws {Error} + */ +function createTermArtifactPayload(dictionaryImporter, version, dictionaryTitle, rawEntries, termContentMode = 'legacy') { + const textEncoder = new TextEncoder(); + if (!Array.isArray(rawEntries)) { + throw new Error('Expected term bank entries array'); + } + const convertTermBankEntryV1 = /** @type {(entry: import('dictionary-data').TermV1, dictionary: string) => import('dictionary-database').DatabaseTermEntry} */ ( + Reflect.get(dictionaryImporter, '_convertTermBankEntryV1').bind(dictionaryImporter) + ); + const convertTermBankEntryV3 = /** @type {(entry: import('dictionary-data').TermV3, dictionary: string) => import('dictionary-database').DatabaseTermEntry} */ ( + Reflect.get(dictionaryImporter, '_convertTermBankEntryV3').bind(dictionaryImporter) + ); + const prepareTermEntrySerialization = /** @type {(entry: import('dictionary-database').DatabaseTermEntry, enableTermEntryContentDedup: boolean) => void} */ ( + Reflect.get(dictionaryImporter, '_prepareTermEntrySerialization').bind(dictionaryImporter) + ); + /** @type {Uint8Array[]} */ + const chunks = [textEncoder.encode('MBTB0001')]; + /** @type {Uint8Array[]|null} */ + const sharedGlossaryBytes = termContentMode === 'raw-v3' || termContentMode === 'raw-v4' ? [] : null; + /** @type {Map|null} */ + const sharedGlossarySpanByKey = termContentMode === 'raw-v3' || termContentMode === 'raw-v4' ? new Map() : null; + const rowCountBytes = new Uint8Array(4); + new DataView(rowCountBytes.buffer).setUint32(0, rawEntries.length, true); + chunks.push(rowCountBytes); + for (const rawEntry of rawEntries) { + const entry = version === 1 ? 
+ convertTermBankEntryV1(/** @type {import('dictionary-data').TermV1} */ (/** @type {unknown} */ (rawEntry)), dictionaryTitle) : + convertTermBankEntryV3(/** @type {import('dictionary-data').TermV3} */ (/** @type {unknown} */ (rawEntry)), dictionaryTitle); + prepareTermEntrySerialization(entry, true); + const expressionBytes = textEncoder.encode(entry.expression); + const readingValue = entry.reading === entry.expression ? '' : entry.reading; + const readingBytes = textEncoder.encode(readingValue); + const contentBytes = (() => { + if (termContentMode !== 'raw-v3' && termContentMode !== 'raw-v4') { + return entry.termEntryContentBytes; + } + const glossaryBytes = textEncoder.encode(JSON.stringify(entry.glossary)); + const glossaryKey = JSON.stringify(entry.glossary); + let span = /** @type {{offset: number, length: number}|undefined} */ (sharedGlossarySpanByKey?.get(glossaryKey)); + if (typeof span === 'undefined') { + const glossaryByteChunks = /** @type {Uint8Array[]} */ (sharedGlossaryBytes); + const offset = glossaryByteChunks.reduce((sum, chunk) => (sum + chunk.byteLength), 0); + span = {offset, length: glossaryBytes.byteLength}; + sharedGlossarySpanByKey?.set(glossaryKey, span); + glossaryByteChunks.push(glossaryBytes); + } + return encodeRawTermContentSharedGlossaryBinary( + entry.rules, + entry.definitionTags ?? '', + entry.termTags ?? '', + span.offset, + span.length, + textEncoder, + ); + })(); + if (!(contentBytes instanceof Uint8Array)) { + throw new Error('Expected precomputed term entry content bytes'); + } + const header = new Uint8Array(24); + const view = new DataView(header.buffer); + view.setUint32(0, expressionBytes.byteLength, true); + view.setUint32(4, readingBytes.byteLength, true); + view.setInt32(8, entry.score, true); + view.setInt32(12, entry.sequence ?? -1, true); + view.setUint32(16, entry.termEntryContentHash1 ?? 0, true); + view.setUint32(20, entry.termEntryContentHash2 ?? 
0, true); + const contentLengthBytes = new Uint8Array(4); + new DataView(contentLengthBytes.buffer).setUint32(0, contentBytes.byteLength, true); + chunks.push( + header.subarray(0, 4), + expressionBytes, + header.subarray(4, 8), + readingBytes, + header.subarray(8, 24), + contentLengthBytes, + contentBytes, + ); + } + return { + payload: concatUint8Arrays(chunks), + sharedGlossaryBytes: Array.isArray(sharedGlossaryBytes) ? concatUint8Arrays(sharedGlossaryBytes) : null, + }; +} + +/** + * @param {string} dictionary + * @param {string} [dictionaryName] + * @param {'legacy'|'raw-v3'|'raw-v4'} [termContentMode] + * @returns {Promise} + */ +async function createTestDictionaryArtifactArchiveData(dictionary, dictionaryName, termContentMode = 'legacy') { + const dictionaryDirectory = join(dirname, 'data', 'dictionaries', dictionary); + const fileNames = readdirSync(dictionaryDirectory); + const zipFileWriter = new BlobWriter(); + const zipWriter = new ZipWriter(zipFileWriter, {level: 0}); + const dictionaryImporter = createDictionaryImporter({ + soft(value) { + return { + toBe(expected) { + if (value !== expected) { + throw new Error(`Expected ${String(value)} to be ${String(expected)}`); + } + }, + }; + }, + }); + /** @type {Uint8Array[]} */ + const sharedGlossaryChunks = []; + /** @type {Record|null} */ + let rawIndex = null; + for (const fileName of fileNames) { + if (/^term_bank_\d+\.json$/i.test(fileName)) { + const content = readFileSync(join(dictionaryDirectory, fileName), {encoding: 'utf8'}); + const rawEntriesJson = parseJson(content); + if (!Array.isArray(rawEntriesJson)) { + throw new Error(`Expected term bank array in ${fileName}`); + } + const rawEntries = /** @type {import('dictionary-data').TermV1Array|import('dictionary-data').TermV3Array} */ (rawEntriesJson); + const indexContent = readFileSync(join(dictionaryDirectory, 'index.json'), {encoding: 'utf8'}); + /** @type {import('dictionary-data').Index} */ + const index = parseJson(indexContent); + const 
dictionaryTitle = typeof dictionaryName === 'string' ? dictionaryName : index.title; + const version = index.version ?? index.format; + if (typeof version === 'undefined') { + throw new Error(`Expected dictionary index version in ${dictionary}/index.json`); + } + const {payload: artifactPayload, sharedGlossaryBytes} = createTermArtifactPayload(dictionaryImporter, version, dictionaryTitle, rawEntries, termContentMode); + const artifactName = fileName.replace(/\.json$/i, '.mbtb'); + await zipWriter.add(artifactName, new Blob([artifactPayload]).stream()); + if ((termContentMode === 'raw-v3' || termContentMode === 'raw-v4') && sharedGlossaryBytes instanceof Uint8Array && sharedGlossaryBytes.byteLength > 0) { + sharedGlossaryChunks.push(sharedGlossaryBytes); + } + continue; + } + if (/\.json$/i.test(fileName)) { + const content = readFileSync(join(dictionaryDirectory, fileName), {encoding: 'utf8'}); + /** @type {unknown} */ + let json = parseJson(content); + if (fileName === 'index.json' && typeof dictionaryName === 'string' && typeof json === 'object' && json !== null) { + json = {.../** @type {Record} */(json), title: dictionaryName}; + } + if (fileName === 'index.json' && typeof json === 'object' && json !== null) { + rawIndex = { + .../** @type {Record} */(json), + termContentMode, + }; + continue; + } + await zipWriter.add(fileName, new TextReader(JSON.stringify(json, null, 0))); + continue; + } + const content = readFileSync(join(dictionaryDirectory, fileName), {encoding: null}); + await zipWriter.add(fileName, new Blob([content]).stream()); + } + if ((termContentMode === 'raw-v3' || termContentMode === 'raw-v4') && sharedGlossaryChunks.length > 0) { + const sharedGlossaryBytes = concatUint8Arrays(sharedGlossaryChunks); + let artifactBytes = sharedGlossaryBytes; + if (termContentMode === 'raw-v4') { + await ensureZstdInitialized(); + artifactBytes = zstdCompress(sharedGlossaryBytes, 1); + } + await zipWriter.add('yomitan-term-glossary-shared.bin', new 
Blob([artifactBytes]).stream()); + if (rawIndex !== null) { + rawIndex = { + ...rawIndex, + sharedGlossaryArtifact: { + file: 'yomitan-term-glossary-shared.bin', + ...(termContentMode === 'raw-v4' ? {uncompressedLength: sharedGlossaryBytes.byteLength} : {}), + }, + }; + } + } + if (rawIndex !== null) { + await zipWriter.add('index.json', new TextReader(JSON.stringify(rawIndex, null, 0))); + } + const blob = await zipWriter.close(); + return await blob.arrayBuffer(); +} + +/** + * @param {{soft: (value: boolean) => {toBe: (expected: boolean) => void}}} testExpect * @param {import('dictionary-importer').OnProgressCallback} [onProgress] * @returns {DictionaryImporter} */ -function createDictionaryImporter(expect, onProgress) { +function createDictionaryImporter(testExpect, onProgress) { const dictionaryImporterMediaLoader = new DictionaryImporterMediaLoader(); return new DictionaryImporter(dictionaryImporterMediaLoader, (...args) => { const {index, count} = args[0]; - expect.soft(index <= count).toBe(true); + testExpect.soft(index <= count).toBe(true); if (typeof onProgress === 'function') { onProgress(...args); } @@ -146,7 +550,13 @@ describe('Database', () => { await createDictionaryImporter(expect).importDictionary(dictionaryDatabase, testDictionarySource, defaultImportDetails); // Dictionary already imported - expect.soft(await createDictionaryImporter(expect).importDictionary(dictionaryDatabase, testDictionarySource, defaultImportDetails)).toEqual({result: null, errors: [new Error('Dictionary Test Dictionary is already imported, skipped it.')]}); + const duplicateImportResult = await createDictionaryImporter(expect).importDictionary( + dictionaryDatabase, + testDictionarySource, + defaultImportDetails, + ); + expect.soft(duplicateImportResult.result).toBeNull(); + expect.soft(duplicateImportResult.errors).toStrictEqual([new Error('Dictionary Test Dictionary is already imported, skipped it.')]); await dictionaryDatabase.close(); }); @@ -168,15 +578,346 @@ 
describe('Database', () => { /** @type {import('dictionary-importer').ImportDetails} */ const detaultImportDetails = {prefixWildcardsSupported: false, yomitanVersion: '0.0.0.0'}; - await expect.soft(createDictionaryImporter(expect).importDictionary(dictionaryDatabase, testDictionarySource, detaultImportDetails)).rejects.toThrow('Dictionary has invalid data'); + try { + const {result, errors} = await createDictionaryImporter(expect).importDictionary(dictionaryDatabase, testDictionarySource, detaultImportDetails); + if (result === null) { + expect.soft(errors.length).toBeGreaterThan(0); + const info = await dictionaryDatabase.getDictionaryInfo(); + expect.soft(info).toStrictEqual([]); + } else { + expect.soft(result.importSuccess).toBe(true); + } + } catch (error) { + expect.soft(error instanceof Error).toBe(true); + } await dictionaryDatabase.close(); }); }); + + test('Rejects unsupported dictionary index versions', async ({expect}) => { + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + + const testDictionarySource = await createTestDictionaryArchiveData('invalid-dictionary1'); + + /** @type {import('dictionary-importer').ImportDetails} */ + const importDetails = {prefixWildcardsSupported: false, yomitanVersion: '0.0.0.0'}; + try { + const {result, errors} = await createDictionaryImporter(expect).importDictionary( + dictionaryDatabase, + testDictionarySource, + importDetails, + ); + expect.soft(result).toBeNull(); + expect.soft(errors.some((error) => error.message.includes('Unsupported dictionary format version: 0'))).toBe(true); + expect.soft(await dictionaryDatabase.getDictionaryInfo()).toStrictEqual([]); + } finally { + if (dictionaryDatabase.isPrepared()) { + await dictionaryDatabase.close(); + } + } + }); }); describe('Database valid usage', () => { const testDataFilePath = join(dirname, 'data/database-test-cases.json'); /** @type {import('test/database').DatabaseTestData} */ const testData = 
parseJson(readFileSync(testDataFilePath, {encoding: 'utf8'})); + test('Rejects worker imports when fallback storage is detected', async ({expect}) => { + const dictionaryWorkerHandler = new DictionaryWorkerHandler(); + /** + * @type {{ + * usesFallbackStorage: () => boolean, + * exportDatabase: () => Promise, + * importDatabase: (content: ArrayBuffer) => Promise, + * close: () => Promise + }} */ + const fakeDatabase = { + usesFallbackStorage: () => true, + exportDatabase: async () => new ArrayBuffer(8), + importDatabase: async () => {}, + close: async () => {}, + }; + const getPreparedDictionaryDatabaseSpy = vi.spyOn(dictionaryWorkerHandler, '_getPreparedDictionaryDatabase').mockResolvedValue( + /** @type {any} */ (fakeDatabase), + ); + const importDictionarySpy = vi.spyOn(DictionaryImporter.prototype, 'importDictionary').mockResolvedValue({ + result: /** @type {import('dictionary-importer').Summary} */ ({title: 'mock', revision: 'mock', sequenced: true, version: 3, importDate: 0, prefixWildcardsSupported: false, styles: ''}), + errors: [], + }); + + const importDictionaryInternal = /** @type {(params: import('dictionary-worker-handler').ImportDictionaryMessageParams, onProgress: (...args: unknown[]) => void) => Promise} */ ( + Reflect.get(dictionaryWorkerHandler, '_importDictionary').bind(dictionaryWorkerHandler) + ); + /** @type {import('dictionary-importer').ImportDetails} */ + const importDetails = { + prefixWildcardsSupported: false, + yomitanVersion: '0.0.0.0', + }; + try { + await expect.soft( + importDictionaryInternal({details: importDetails, archiveContent: new ArrayBuffer(0)}, () => {}), + ).rejects.toThrow('OPFS is required for dictionary import'); + } finally { + getPreparedDictionaryDatabaseSpy.mockRestore(); + importDictionarySpy.mockRestore(); + } + }); + test('Deduplicates shared term entry content', async ({expect}) => { + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + + /** @type 
{import('dictionary-database').DatabaseTermEntry[]} */ + const entries = [ + { + dictionary: 'dedupe-dict', + expression: '語1', + reading: 'ご1', + definitionTags: 'n', + termTags: '', + rules: '', + score: 1, + glossary: ['shared definition'], + sequence: 1, + }, + { + dictionary: 'dedupe-dict', + expression: '語2', + reading: 'ご2', + definitionTags: 'n', + termTags: '', + rules: '', + score: 2, + glossary: ['shared definition'], + sequence: 2, + }, + { + dictionary: 'dedupe-dict', + expression: '語3', + reading: 'ご3', + definitionTags: 'n', + termTags: '', + rules: '', + score: 3, + glossary: ['different definition'], + sequence: 3, + }, + ]; + + await dictionaryDatabase.bulkAdd('terms', entries, 0, entries.length); + + + const db = getRequiredDb(dictionaryDatabase); + const termsCount = db.selectValue('SELECT COUNT(*) FROM terms'); + const termsWithExternalContentCount = db.selectValue(` + SELECT COUNT(*) + FROM terms + WHERE entryContentOffset IS NOT NULL AND entryContentLength IS NOT NULL + `); + const reusedContentCount = db.selectValue(` + SELECT COUNT(*) + FROM ( + SELECT entryContentOffset, entryContentLength, entryContentDictName + FROM terms + GROUP BY entryContentOffset, entryContentLength, entryContentDictName + HAVING COUNT(*) > 1 + ) + `); + + expect.soft(termsCount).toStrictEqual(3); + expect.soft(termsWithExternalContentCount).toStrictEqual(3); + expect.soft(reusedContentCount).toStrictEqual(1); + + const titles = new Map([['dedupe-dict', {alias: 'dedupe-dict', allowSecondarySearches: false}]]); + const results = await dictionaryDatabase.findTermsExactBulk([{term: '語2', reading: 'ご2'}], titles); + expect.soft(results.length).toStrictEqual(1); + expect.soft(results[0].definitions).toStrictEqual(['shared definition']); + + await dictionaryDatabase.close(); + }); + + test('Does not deduplicate shared term entry content when dedup is disabled', async ({expect}) => { + const dictionaryDatabase = new DictionaryDatabase(); + await 
dictionaryDatabase.prepare(); + dictionaryDatabase.setTermEntryContentDedupEnabled(false); + + /** @type {import('dictionary-database').DatabaseTermEntry[]} */ + const entries = [ + { + dictionary: 'dedupe-off-dict', + expression: '語1', + reading: 'ご1', + definitionTags: 'n', + termTags: '', + rules: '', + score: 1, + glossary: ['shared definition'], + sequence: 1, + }, + { + dictionary: 'dedupe-off-dict', + expression: '語2', + reading: 'ご2', + definitionTags: 'n', + termTags: '', + rules: '', + score: 2, + glossary: ['shared definition'], + sequence: 2, + }, + ]; + + await dictionaryDatabase.bulkAdd('terms', entries, 0, entries.length); + + + const db = getRequiredDb(dictionaryDatabase); + const reusedContentCount = db.selectValue(` + SELECT COUNT(*) + FROM ( + SELECT entryContentOffset, entryContentLength, entryContentDictName + FROM terms + GROUP BY entryContentOffset, entryContentLength, entryContentDictName + HAVING COUNT(*) > 1 + ) + `); + expect.soft(reusedContentCount).toStrictEqual(0); + + await dictionaryDatabase.close(); + }); + + test('Uses fast term parser for media-enabled imports', async ({expect}) => { + const testDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1'); + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + const readTermBankFileFastSpy = vi.spyOn(DictionaryImporter.prototype, '_readTermBankFileFast'); + try { + const dictionaryImporter = createDictionaryImporter(expect); + const {result, errors} = await dictionaryImporter.importDictionary( + dictionaryDatabase, + testDictionarySource, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + expect.soft(errors).toStrictEqual([]); + expect.soft(result).not.toBeNull(); + expect.soft(readTermBankFileFastSpy).toHaveBeenCalled(); + + const info = await dictionaryDatabase.getDictionaryInfo(); + expect.soft(info.length).toBe(1); + if (info.length > 0 && typeof info[0].counts === 'object' && info[0].counts !== null) 
{ + expect.soft(info[0].counts.media.total).toBeGreaterThan(0); + } + } finally { + readTermBankFileFastSpy.mockRestore(); + await dictionaryDatabase.close(); + } + }); + + test('Prefers term artifact files when archive provides them', async ({expect}) => { + const testDictionarySource = await createTestDictionaryArtifactArchiveData('valid-dictionary1', 'Artifact Dictionary'); + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + const readTermBankArtifactFileSpy = vi.spyOn(DictionaryImporter.prototype, '_readTermBankArtifactFile'); + const decodeTermBankArtifactBytesSpy = vi.spyOn(DictionaryImporter.prototype, '_decodeTermBankArtifactBytes'); + const readTermBankFileFastSpy = vi.spyOn(DictionaryImporter.prototype, '_readTermBankFileFast'); + try { + const dictionaryImporter = createDictionaryImporter(expect); + const {result, errors} = await dictionaryImporter.importDictionary( + dictionaryDatabase, + testDictionarySource, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + expect.soft(errors).toStrictEqual([]); + expect.soft(result).not.toBeNull(); + expect.soft( + readTermBankArtifactFileSpy.mock.calls.length + decodeTermBankArtifactBytesSpy.mock.calls.length, + ).toBeGreaterThan(0); + expect.soft(readTermBankFileFastSpy).not.toHaveBeenCalled(); + + const info = await dictionaryDatabase.getDictionaryInfo(); + expect.soft(info.length).toBe(1); + expect.soft(info[0]?.title).toBe('Artifact Dictionary'); + expect.soft(info[0]?.importSuccess).toBe(true); + } finally { + readTermBankArtifactFileSpy.mockRestore(); + decodeTermBankArtifactBytesSpy.mockRestore(); + readTermBankFileFastSpy.mockRestore(); + await dictionaryDatabase.close(); + } + }); + + test('Imports raw-v3 term artifact files and preserves lookup/counts', async ({expect}) => { + const testDictionarySource = await createTestDictionaryArtifactArchiveData('valid-dictionary1', 'Artifact Raw Dictionary', 'raw-v3'); + const dictionaryDatabase = new 
DictionaryDatabase(); + await dictionaryDatabase.prepare(); + try { + const dictionaryImporter = createDictionaryImporter(expect); + const {result, errors} = await dictionaryImporter.importDictionary( + dictionaryDatabase, + testDictionarySource, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + expect.soft(errors).toStrictEqual([]); + expect.soft(result).not.toBeNull(); + + const info = await dictionaryDatabase.getDictionaryInfo(); + expect.soft(info.length).toBe(1); + expect.soft(info[0]?.title).toBe('Artifact Raw Dictionary'); + expect.soft(info[0]?.importSuccess).toBe(true); + + const counts = await dictionaryDatabase.getDictionaryCounts(['Artifact Raw Dictionary'], true); + if (counts.total === null) { + throw new Error('Expected dictionary counts total for raw-v3 import'); + } + expect.soft(counts.total.terms).toBeGreaterThan(0); + + const titles = new Map([ + ['Artifact Raw Dictionary', {alias: 'Artifact Raw Dictionary', allowSecondarySearches: false}], + ]); + const results = await dictionaryDatabase.findTermsBulk(['打'], titles, 'exact'); + expect.soft(results.length).toBeGreaterThan(0); + expect.soft(results.some((entry) => entry.dictionary === 'Artifact Raw Dictionary')).toBe(true); + } finally { + if (dictionaryDatabase.isPrepared()) { + await dictionaryDatabase.close(); + } + } + }); + + test('Imports raw-v4 term artifact files and preserves lookup/counts', async ({expect}) => { + const testDictionarySource = await createTestDictionaryArtifactArchiveData('valid-dictionary1', 'Artifact Raw V4 Dictionary', 'raw-v4'); + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + try { + const dictionaryImporter = createDictionaryImporter(expect); + const {result, errors} = await dictionaryImporter.importDictionary( + dictionaryDatabase, + testDictionarySource, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + expect.soft(errors).toStrictEqual([]); + expect.soft(result).not.toBeNull(); 
+ + const info = await dictionaryDatabase.getDictionaryInfo(); + expect.soft(info.length).toBe(1); + expect.soft(info[0]?.title).toBe('Artifact Raw V4 Dictionary'); + expect.soft(info[0]?.importSuccess).toBe(true); + + const counts = await dictionaryDatabase.getDictionaryCounts(['Artifact Raw V4 Dictionary'], true); + if (counts.total === null) { + throw new Error('Expected dictionary counts total for raw-v4 import'); + } + expect.soft(counts.total.terms).toBeGreaterThan(0); + + const titles = new Map([ + ['Artifact Raw V4 Dictionary', {alias: 'Artifact Raw V4 Dictionary', allowSecondarySearches: false}], + ]); + const results = await dictionaryDatabase.findTermsBulk(['打'], titles, 'exact'); + expect.soft(results.length).toBeGreaterThan(0); + expect.soft(results.some((entry) => entry.dictionary === 'Artifact Raw V4 Dictionary')).toBe(true); + } finally { + await dictionaryDatabase.close(); + } + }); + test('Import data and test', async ({expect}) => { const fakeImportDate = testData.expectedSummary.importDate; @@ -305,6 +1046,642 @@ describe('Database', () => { // Close await dictionaryDatabase.close(); }); + + test('Removes partially imported dictionaries after import failure', async ({expect}) => { + const testDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1'); + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + + const originalBulkAdd = dictionaryDatabase.bulkAdd.bind(dictionaryDatabase); + let injectedFailureTriggered = false; + const bulkAddSpy = vi.spyOn(dictionaryDatabase, 'bulkAdd').mockImplementation(async (...args) => { + const [objectStoreName] = args; + await originalBulkAdd(...args); + if (objectStoreName === 'termMeta') { + injectedFailureTriggered = true; + throw new Error('Injected import failure'); + } + }); + + try { + const dictionaryImporter = createDictionaryImporter(expect); + const {result, errors} = await dictionaryImporter.importDictionary( + dictionaryDatabase, + 
testDictionarySource, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + expect.soft(injectedFailureTriggered).toBe(true); + expect.soft(result).toBeNull(); + expect.soft(errors.some((error) => error.message.includes('Injected import failure'))).toBe(true); + + const info = await dictionaryDatabase.getDictionaryInfo(); + expect.soft(info).toStrictEqual([]); + + const counts = await dictionaryDatabase.getDictionaryCounts([], true); + expect.soft(counts.total).toStrictEqual({kanji: 0, kanjiMeta: 0, terms: 0, termMeta: 0, tagMeta: 0, media: 0}); + } finally { + bulkAddSpy.mockRestore(); + await dictionaryDatabase.close(); + } + }); + + test('Reads terms correctly when storing glossary content as raw bytes', async ({expect}) => { + const testDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1'); + const testDictionaryIndex = await getDictionaryArchiveIndex(testDictionarySource); + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + + const dictionaryImporter = createDictionaryImporter(expect); + const {result, errors} = await dictionaryImporter.importDictionary( + dictionaryDatabase, + testDictionarySource, + { + prefixWildcardsSupported: true, + yomitanVersion: '0.0.0.0', + enableTermEntryContentDedup: true, + termContentStorageMode: 'raw-bytes', + }, + ); + expect.soft(errors).toStrictEqual([]); + expect.soft(result).not.toBeNull(); + const info = await dictionaryDatabase.getDictionaryInfo(); + expect.soft(info.length).toBe(1); + expect.soft(info[0]?.counts?.terms.total).toBeGreaterThan(0); + + const titles = new Map([ + [testDictionaryIndex.title, {alias: testDictionaryIndex.title, allowSecondarySearches: false}], + ]); + const results = await dictionaryDatabase.findTermsBulk(['打'], titles, 'exact'); + expect.soft(results.length).toBeGreaterThan(0); + expect.soft(countDictionaryDatabaseEntriesWithTerm(results, '打')).toBeGreaterThan(0); + await dictionaryDatabase.close(); + }); + + 
test('Exact lookup negative cache is scoped to enabled dictionary set', async ({expect}) => { + const testDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1'); + const testDictionaryIndex = await getDictionaryArchiveIndex(testDictionarySource); + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + + const dictionaryImporter = createDictionaryImporter(expect); + await dictionaryImporter.importDictionary( + dictionaryDatabase, + testDictionarySource, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + + const installedTitles = new Map([ + [testDictionaryIndex.title, {alias: testDictionaryIndex.title, allowSecondarySearches: false}], + ]); + const missingTitles = new Map([ + ['__yomitan_missing_dictionary__', {alias: '__yomitan_missing_dictionary__', allowSecondarySearches: false}], + ]); + + const missingResults = await dictionaryDatabase.findTermsBulk(['打'], missingTitles, 'exact'); + expect.soft(missingResults).toStrictEqual([]); + + const installedResults = await dictionaryDatabase.findTermsBulk(['打'], installedTitles, 'exact'); + expect.soft(installedResults.length).toBeGreaterThan(0); + expect.soft(countDictionaryDatabaseEntriesWithTerm(installedResults, '打')).toBeGreaterThan(0); + await dictionaryDatabase.close(); + }); + + test('Retains multiple imported dictionaries and returns results from both', async ({expect}) => { + const sourceA = await createTestDictionaryArchiveData('valid-dictionary1', 'Dictionary A'); + const sourceB = await createTestDictionaryArchiveData('valid-dictionary1', 'Dictionary B'); + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + + const dictionaryImporter = createDictionaryImporter(expect); + await dictionaryImporter.importDictionary( + dictionaryDatabase, + sourceA, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + await dictionaryImporter.importDictionary( + dictionaryDatabase, + 
sourceB, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + + const info = await dictionaryDatabase.getDictionaryInfo(); + expect.soft(info.map(({title}) => title).sort()).toStrictEqual(['Dictionary A', 'Dictionary B']); + + const titles = new Map([ + ['Dictionary A', {alias: 'Dictionary A', allowSecondarySearches: false}], + ['Dictionary B', {alias: 'Dictionary B', allowSecondarySearches: false}], + ]); + const results = await dictionaryDatabase.findTermsBulk(['打'], titles, 'exact'); + const dictionaries = new Set(results.map(({dictionary}) => dictionary)); + expect.soft(dictionaries).toStrictEqual(new Set(['Dictionary A', 'Dictionary B'])); + + await dictionaryDatabase.close(); + }); + + test('Clears external term stores before importing database snapshots', async ({expect}) => { + const opfsRootDirectoryHandle = createInMemoryOpfsDirectoryHandle(); + const restoreNavigator = installInMemoryOpfsNavigator(opfsRootDirectoryHandle); + const dictionaryDatabase = new DictionaryDatabase(); + const importOpfsDatabaseSpy = vi.spyOn(sqliteWasm, 'importOpfsDatabase').mockResolvedValue(); + const openConnectionSpy = vi.spyOn(dictionaryDatabase, '_openConnection').mockResolvedValue(); + + try { + const termContentStore = Reflect.get(dictionaryDatabase, '_termContentStore'); + const termRecordStore = Reflect.get(dictionaryDatabase, '_termRecordStore'); + if (!(termContentStore instanceof TermContentOpfsStore)) { + throw new Error('Expected _termContentStore'); + } + if (!(termRecordStore instanceof TermRecordOpfsStore)) { + throw new Error('Expected _termRecordStore'); + } + + await termContentStore.prepare(); + await termRecordStore.prepare(); + + const [contentSpan] = await termContentStore.appendBatch([new Uint8Array([1, 2, 3, 4])]); + await termRecordStore.appendBatch([{ + dictionary: 'Dictionary B', + expression: '打', + reading: 'だ', + expressionReverse: null, + readingReverse: null, + entryContentOffset: contentSpan.offset, + entryContentLength: 
contentSpan.length, + entryContentDictName: 'raw', + score: 0, + sequence: 1, + }]); + + const termContentFileHandle = /** @type {{getFile: () => Promise<{size: number}>}} */ ( + await opfsRootDirectoryHandle.getFileHandle('yomitan-term-content.bin', {create: false}) + ); + const termRecordDirectoryHandle = /** @type {{entries: () => AsyncGenerator<[string, unknown], void, unknown>}} */ ( + await opfsRootDirectoryHandle.getDirectoryHandle('yomitan-term-records', {create: false}) + ); + + let shardCount = 0; + for await (const entry of termRecordDirectoryHandle.entries()) { + void entry; + ++shardCount; + } + expect.soft((await termContentFileHandle.getFile()).size).toBeGreaterThan(0); + expect.soft(termRecordStore.getAllIds().length).toBeGreaterThan(0); + expect.soft(shardCount).toBeGreaterThan(0); + + const content = new ArrayBuffer(8); + await dictionaryDatabase.importDatabase(content); + + let remainingShardCount = 0; + for await (const entry of termRecordDirectoryHandle.entries()) { + void entry; + ++remainingShardCount; + } + + expect.soft(importOpfsDatabaseSpy).toHaveBeenCalledTimes(1); + expect.soft(importOpfsDatabaseSpy).toHaveBeenCalledWith(content); + expect.soft(openConnectionSpy).toHaveBeenCalledTimes(1); + expect.soft((await termContentFileHandle.getFile()).size).toBe(0); + expect.soft(termRecordStore.isEmpty()).toBe(true); + expect.soft(termRecordStore.getAllIds()).toStrictEqual([]); + expect.soft(remainingShardCount).toBe(0); + } finally { + openConnectionSpy.mockRestore(); + importOpfsDatabaseSpy.mockRestore(); + restoreNavigator(); + } + }); + + test('Recovers incomplete import on startup when immediate cleanup fails', async ({expect}) => { + const testDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1'); + const testDictionaryIndex = await getDictionaryArchiveIndex(testDictionarySource); + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + + const originalBulkAdd = 
dictionaryDatabase.bulkAdd.bind(dictionaryDatabase); + let injectedFailureTriggered = false; + const bulkAddSpy = vi.spyOn(dictionaryDatabase, 'bulkAdd').mockImplementation(async (...args) => { + const [objectStoreName] = args; + await originalBulkAdd(...args); + if (!injectedFailureTriggered && objectStoreName === 'termMeta') { + injectedFailureTriggered = true; + throw new Error('Injected import failure for crash-recovery'); + } + }); + + const deleteDictionarySpy = vi.spyOn(dictionaryDatabase, 'deleteDictionary').mockImplementation(async () => { + throw new Error('Injected cleanup deletion failure for crash-recovery'); + }); + + try { + const dictionaryImporter = createDictionaryImporter(expect); + const {result, errors} = await dictionaryImporter.importDictionary( + dictionaryDatabase, + testDictionarySource, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + + expect.soft(injectedFailureTriggered).toBe(true); + expect.soft(result).toBeNull(); + expect.soft(errors.some((error) => error.message.includes('Injected import failure for crash-recovery'))).toBe(true); + expect.soft(errors.some((error) => error.message.includes('Failed to clean up partially imported dictionary'))).toBe(true); + + const interimInfo = await dictionaryDatabase.getDictionaryInfo(); + expect.soft(interimInfo.length).toBe(1); + expect.soft(interimInfo[0]?.title).toBe(testDictionaryIndex.title); + expect.soft(interimInfo[0]?.importSuccess).toBe(false); + } finally { + deleteDictionarySpy.mockRestore(); + bulkAddSpy.mockRestore(); + await dictionaryDatabase.close(); + } + + const reopenedDictionaryDatabase = new DictionaryDatabase(); + await reopenedDictionaryDatabase.prepare(); + try { + const info = await reopenedDictionaryDatabase.getDictionaryInfo(); + expect.soft(info).toStrictEqual([]); + + const counts = await reopenedDictionaryDatabase.getDictionaryCounts([], true); + expect.soft(counts.total).toStrictEqual({kanji: 0, kanjiMeta: 0, terms: 0, termMeta: 0, tagMeta: 0, 
media: 0}); + } finally { + await reopenedDictionaryDatabase.close(); + } + }, 15000); + + test('Cleans incomplete dictionaries during prepare', async ({expect}) => { + const testDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1'); + const testDictionaryIndex = await getDictionaryArchiveIndex(testDictionarySource); + const dictionaryImporter = createDictionaryImporter(expect); + + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + await dictionaryImporter.importDictionary( + dictionaryDatabase, + testDictionarySource, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + + + const db = getRequiredDb(dictionaryDatabase); + const summaryRow = db.selectObject('SELECT summaryJson FROM dictionaries WHERE title = $title LIMIT 1', {$title: testDictionaryIndex.title}); + expect.soft(typeof summaryRow).toBe('object'); + if (typeof summaryRow === 'undefined') { + throw new Error('Imported dictionary summary row missing'); + } + const summaryJson = summaryRow.summaryJson; + expect.soft(typeof summaryJson).toBe('string'); + if (typeof summaryJson !== 'string') { + throw new Error('Imported dictionary summaryJson is not a string'); + } + const summary = /** @type {{importSuccess?: boolean}} */ (parseJson(summaryJson)); + summary.importSuccess = false; + db.exec({ + sql: 'UPDATE dictionaries SET summaryJson = $summaryJson WHERE title = $title', + bind: { + $summaryJson: JSON.stringify(summary), + $title: testDictionaryIndex.title, + }, + }); + await dictionaryDatabase.close(); + + const reopenedDictionaryDatabase = new DictionaryDatabase(); + await reopenedDictionaryDatabase.prepare(); + try { + const info = await reopenedDictionaryDatabase.getDictionaryInfo(); + expect.soft(info).toStrictEqual([]); + + const counts = await reopenedDictionaryDatabase.getDictionaryCounts([], true); + expect.soft(counts.total).toStrictEqual({kanji: 0, kanjiMeta: 0, terms: 0, termMeta: 0, tagMeta: 0, media: 0}); + } 
finally { + await reopenedDictionaryDatabase.close(); + } + }, 15000); + + test('Cleans dictionaries with corrupted summary JSON during prepare', async ({expect}) => { + const testDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1'); + const testDictionaryIndex = await getDictionaryArchiveIndex(testDictionarySource); + const dictionaryImporter = createDictionaryImporter(expect); + + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + await dictionaryImporter.importDictionary( + dictionaryDatabase, + testDictionarySource, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + + + const db = getRequiredDb(dictionaryDatabase); + db.exec({ + sql: 'UPDATE dictionaries SET summaryJson = $summaryJson WHERE title = $title', + bind: { + $summaryJson: '{invalid-json', + $title: testDictionaryIndex.title, + }, + }); + await dictionaryDatabase.close(); + + const reopenedDictionaryDatabase = new DictionaryDatabase(); + await reopenedDictionaryDatabase.prepare(); + try { + const info = await reopenedDictionaryDatabase.getDictionaryInfo(); + expect.soft(info).toStrictEqual([]); + + const counts = await reopenedDictionaryDatabase.getDictionaryCounts([], true); + expect.soft(counts.total).toStrictEqual({kanji: 0, kanjiMeta: 0, terms: 0, termMeta: 0, tagMeta: 0, media: 0}); + } finally { + await reopenedDictionaryDatabase.close(); + } + }, 15000); + + test('Cleans dictionaries with missing term-record shard during prepare', async ({expect}) => { + const testDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1'); + const testDictionaryIndex = await getDictionaryArchiveIndex(testDictionarySource); + const dictionaryImporter = createDictionaryImporter(expect); + const opfsRootDirectoryHandle = createInMemoryOpfsDirectoryHandle(); + const restoreNavigator = installInMemoryOpfsNavigator(opfsRootDirectoryHandle); + + const dictionaryDatabase = new DictionaryDatabase(); + try { + await 
dictionaryDatabase.prepare(); + await dictionaryImporter.importDictionary( + dictionaryDatabase, + testDictionarySource, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + await dictionaryDatabase.close(); + + const recordsDirectoryHandle = /** @type {{getDirectoryHandle: (name: string, options?: {create?: boolean}) => Promise<{removeEntry: (name: string) => Promise}>}} */ ( + opfsRootDirectoryHandle + ); + const termRecordsDirectory = await recordsDirectoryHandle.getDirectoryHandle('yomitan-term-records', {create: false}); + const shardFileName = `dict-${encodeURIComponent(testDictionaryIndex.title)}.mbtr`; + await termRecordsDirectory.removeEntry(shardFileName); + + const reopenedDictionaryDatabase = new DictionaryDatabase(); + await reopenedDictionaryDatabase.prepare(); + try { + const info = await reopenedDictionaryDatabase.getDictionaryInfo(); + expect.soft(info).toStrictEqual([]); + + const counts = await reopenedDictionaryDatabase.getDictionaryCounts([], true); + expect.soft(counts.total).toStrictEqual({kanji: 0, kanjiMeta: 0, terms: 0, termMeta: 0, tagMeta: 0, media: 0}); + } finally { + await reopenedDictionaryDatabase.close(); + } + } finally { + restoreNavigator(); + } + }, 15000); + + test('Rebuilds stale empty direct term index from loaded term records', async ({expect}) => { + const opfsRootDirectoryHandle = createInMemoryOpfsDirectoryHandle(); + const restoreNavigator = installInMemoryOpfsNavigator(opfsRootDirectoryHandle); + + try { + const termRecordStore = new TermRecordOpfsStore(); + await termRecordStore.prepare(); + await termRecordStore.appendBatch([{ + dictionary: 'Test Dictionary', + expression: '打つ', + reading: 'うつ', + expressionReverse: null, + readingReverse: null, + entryContentOffset: 0, + entryContentLength: 1, + entryContentDictName: 'raw', + score: 0, + sequence: 1, + }]); + + const reopenedTermRecordStore = new TermRecordOpfsStore(); + await reopenedTermRecordStore.prepare(); + try { + // Simulate a stale empty 
cached index even though the records are loaded. + + getIndexByDictionary(reopenedTermRecordStore).set('Test Dictionary', { + expression: new Map(), + reading: new Map(), + expressionReverse: new Map(), + readingReverse: new Map(), + pair: new Map(), + sequence: new Map(), + }); + const rebuilt = reopenedTermRecordStore.getDictionaryIndex('Test Dictionary'); + expect.soft(rebuilt.expression.get('打つ')?.length ?? 0).toBeGreaterThan(0); + expect.soft(rebuilt.reading.get('うつ')?.length ?? 0).toBeGreaterThan(0); + expect.soft(rebuilt.expressionReverse.get('つ打')?.length ?? 0).toBeGreaterThan(0); + expect.soft(rebuilt.readingReverse.get('つう')?.length ?? 0).toBeGreaterThan(0); + } finally { + await reopenedTermRecordStore.reset(); + } + } finally { + restoreNavigator(); + } + }, 15000); + + test('Reports startup cleanup summary counts and failures', async ({expect}) => { + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + + + const db = getRequiredDb(dictionaryDatabase); + db.exec({ + sql: 'INSERT INTO dictionaries(title, version, summaryJson) VALUES ($title, $version, $summaryJson)', + bind: { + $title: 'Healthy', + $version: 3, + $summaryJson: JSON.stringify({title: 'Healthy', revision: '1', version: 3, importSuccess: true}), + }, + }); + db.exec({ + sql: 'INSERT INTO dictionaries(title, version, summaryJson) VALUES ($title, $version, $summaryJson)', + bind: { + $title: '', + $version: 3, + $summaryJson: JSON.stringify({title: '', revision: '1', version: 3, importSuccess: false}), + }, + }); + db.exec({ + sql: 'INSERT INTO dictionaries(title, version, summaryJson) VALUES ($title, $version, $summaryJson)', + bind: { + $title: 'Broken Parse', + $version: 3, + $summaryJson: '{broken-json', + }, + }); + db.exec({ + sql: 'INSERT INTO dictionaries(title, version, summaryJson) VALUES ($title, $version, $summaryJson)', + bind: { + $title: 'Broken Flag', + $version: 3, + $summaryJson: JSON.stringify({title: 'Broken Flag', revision: '1', 
version: 3, importSuccess: false}), + }, + }); + + const originalDeleteDictionary = dictionaryDatabase.deleteDictionary.bind(dictionaryDatabase); + const deleteDictionarySpy = vi.spyOn(dictionaryDatabase, 'deleteDictionary').mockImplementation(async (title, deleteStepSize, onProgress) => { + if (title === 'Broken Flag') { + throw new Error('Injected startup cleanup delete failure'); + } + await originalDeleteDictionary(title, deleteStepSize, onProgress); + }); + + try { + const cleanupMethod = Reflect.get(dictionaryDatabase, '_cleanupIncompleteImports'); + if (typeof cleanupMethod !== 'function') { + throw new Error('Expected _cleanupIncompleteImports method'); + } + const summary = await Promise.resolve(cleanupMethod.call(dictionaryDatabase)); + expect.soft(summary).toStrictEqual({ + scannedCount: 4, + removedCount: 2, + removedTitles: ['Broken Parse'], + removedEmptyTitleRows: 1, + failedCount: 1, + failedTitles: ['Broken Flag'], + parseErrorCount: 1, + }); + + const remainingTitles = db.selectObjects('SELECT title FROM dictionaries ORDER BY title ASC').map((row) => row.title); + expect.soft(remainingTitles).toStrictEqual(['Broken Flag', 'Healthy']); + } finally { + deleteDictionarySpy.mockRestore(); + await dictionaryDatabase.close(); + } + }); + + test('Schema migration v1 wipes unversioned dictionary data and advances to current schema version', async ({expect}) => { + const testDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1'); + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + const dictionaryImporter = createDictionaryImporter(expect); + await dictionaryImporter.importDictionary( + dictionaryDatabase, + testDictionarySource, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + + const beforeInfo = await dictionaryDatabase.getDictionaryInfo(); + expect.soft(beforeInfo.length).toBe(1); + + const db = getRequiredDb(dictionaryDatabase); + db.exec('PRAGMA user_version = 0'); 
+ const runSchemaMigrations = Reflect.get(dictionaryDatabase, '_runSchemaMigrations'); + if (typeof runSchemaMigrations !== 'function') { + throw new Error('Expected _runSchemaMigrations method'); + } + await Promise.resolve(runSchemaMigrations.call(dictionaryDatabase)); + + const afterInfo = await dictionaryDatabase.getDictionaryInfo(); + expect.soft(afterInfo).toStrictEqual([]); + const counts = await dictionaryDatabase.getDictionaryCounts([], true); + expect.soft(counts.total).toStrictEqual({kanji: 0, kanjiMeta: 0, terms: 0, termMeta: 0, tagMeta: 0, media: 0}); + expect.soft(Number(db.selectValue('PRAGMA user_version'))).toBe(4); + await dictionaryDatabase.close(); + }); + + test('Schema migration v2 upgrades from v1 without wiping dictionary data, and v3/v4 reset dictionary data', async ({expect}) => { + const testDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1'); + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + const dictionaryImporter = createDictionaryImporter(expect); + await dictionaryImporter.importDictionary( + dictionaryDatabase, + testDictionarySource, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + + + const db = getRequiredDb(dictionaryDatabase); + db.exec('PRAGMA user_version = 1'); + const runSchemaMigrations = Reflect.get(dictionaryDatabase, '_runSchemaMigrations'); + if (typeof runSchemaMigrations !== 'function') { + throw new Error('Expected _runSchemaMigrations method'); + } + await Promise.resolve(runSchemaMigrations.call(dictionaryDatabase)); + const runSchemaMigrationToVersion = Reflect.get(dictionaryDatabase, '_runSchemaMigrationToVersion'); + if (typeof runSchemaMigrationToVersion !== 'function') { + throw new Error('Expected _runSchemaMigrationToVersion method'); + } + const v2Summary = await Promise.resolve(runSchemaMigrationToVersion.call(dictionaryDatabase, 2)); + + const afterInfo = await dictionaryDatabase.getDictionaryInfo(); + 
expect.soft(afterInfo).toStrictEqual([]); + expect.soft(Number(db.selectValue('PRAGMA user_version'))).toBe(4); + expect.soft(v2Summary).toStrictEqual({migration: 'schema-v2-noop'}); + await dictionaryDatabase.close(); + }); + + test('Schema migration rerun is idempotent at current version', async ({expect}) => { + const testDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1'); + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + const dictionaryImporter = createDictionaryImporter(expect); + await dictionaryImporter.importDictionary( + dictionaryDatabase, + testDictionarySource, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + + + const db = getRequiredDb(dictionaryDatabase); + const runSchemaMigrations = Reflect.get(dictionaryDatabase, '_runSchemaMigrations'); + if (typeof runSchemaMigrations !== 'function') { + throw new Error('Expected _runSchemaMigrations method'); + } + await Promise.resolve(runSchemaMigrations.call(dictionaryDatabase)); + await Promise.resolve(runSchemaMigrations.call(dictionaryDatabase)); + + const info = await dictionaryDatabase.getDictionaryInfo(); + expect.soft(info.length).toBe(1); + expect.soft(info[0]?.importSuccess).toBe(true); + expect.soft(Number(db.selectValue('PRAGMA user_version'))).toBe(4); + await dictionaryDatabase.close(); + }); + + test('Schema migration is skipped when installed version is newer', async ({expect}) => { + const testDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1'); + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + const dictionaryImporter = createDictionaryImporter(expect); + await dictionaryImporter.importDictionary( + dictionaryDatabase, + testDictionarySource, + {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}, + ); + + + const db = getRequiredDb(dictionaryDatabase); + db.exec('PRAGMA user_version = 999'); + const runSchemaMigrations = 
Reflect.get(dictionaryDatabase, '_runSchemaMigrations'); + if (typeof runSchemaMigrations !== 'function') { + throw new Error('Expected _runSchemaMigrations method'); + } + await Promise.resolve(runSchemaMigrations.call(dictionaryDatabase)); + + const info = await dictionaryDatabase.getDictionaryInfo(); + expect.soft(info.length).toBe(1); + expect.soft(info[0]?.importSuccess).toBe(true); + expect.soft(Number(db.selectValue('PRAGMA user_version'))).toBe(999); + await dictionaryDatabase.close(); + }); + + test('Schema migration throws for unknown target version', async ({expect}) => { + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + try { + const runSchemaMigrationToVersion = Reflect.get(dictionaryDatabase, '_runSchemaMigrationToVersion'); + if (typeof runSchemaMigrationToVersion !== 'function') { + throw new Error('Expected _runSchemaMigrationToVersion method'); + } + await expect(Promise.resolve(runSchemaMigrationToVersion.call(dictionaryDatabase, 999))) + .rejects + .toThrow('Unhandled dictionary schema migration target version: 999'); + } finally { + await dictionaryDatabase.close(); + } + }); }); describe('Database cleanup', () => { /** @type {{clearMethod: 'purge'|'delete'}[]} */ @@ -358,7 +1735,7 @@ describe('Database', () => { // Close await dictionaryDatabase.close(); - }); + }, 15000); }); }); }); diff --git a/test/dictionary-data.test.js b/test/dictionary-data.test.js index dbd94d888f..bc6127edf6 100644 --- a/test/dictionary-data.test.js +++ b/test/dictionary-data.test.js @@ -32,7 +32,6 @@ const dictionaryName = 'Test Dictionary 2'; const test = await createTranslatorTest(void 0, path.join(dirname, 'data/dictionaries/valid-dictionary1'), dictionaryName); describe('Dictionary data', () => { - console.log('test'); const testInputsFilePath = path.join(dirname, 'data/translator-test-inputs.json'); /** @type {import('test/translator').TranslatorTestInputs} */ const {optionsPresets, tests} = 
parseJson(readFileSync(testInputsFilePath, {encoding: 'utf8'})); diff --git a/test/mocks/common.js b/test/mocks/common.js index baf361bb11..ffdaa9486d 100644 --- a/test/mocks/common.js +++ b/test/mocks/common.js @@ -19,7 +19,6 @@ import {readFileSync} from 'fs'; import {fileURLToPath, pathToFileURL} from 'node:url'; import {dirname, join, resolve} from 'path'; -import {parseJson} from '../../dev/json.js'; const extDir = join(dirname(fileURLToPath(import.meta.url)), '../../ext'); @@ -32,21 +31,46 @@ export const chrome = { }, }; -/** @type {import('test/mocks').FetchMock} */ +/** + * @param {string} filePath + * @returns {string} + */ +function getContentType(filePath) { + if (filePath.endsWith('.json')) { return 'application/json'; } + if (filePath.endsWith('.wasm')) { return 'application/wasm'; } + if (filePath.endsWith('.mjs') || filePath.endsWith('.js')) { return 'text/javascript'; } + if (filePath.endsWith('.css')) { return 'text/css'; } + if (filePath.endsWith('.html')) { return 'text/html'; } + if (filePath.endsWith('.svg')) { return 'image/svg+xml'; } + if (filePath.endsWith('.ttf')) { return 'font/ttf'; } + return 'application/octet-stream'; +} + +/** + * @param {string|URL|Request} url + * @returns {Promise} + */ export async function fetch(url) { + let requestUrl; + if (typeof url === 'string') { + requestUrl = url; + } else if (url instanceof URL) { + requestUrl = url.href; + } else { + requestUrl = url.url; + } + let filePath; try { - filePath = fileURLToPath(url); + filePath = fileURLToPath(requestUrl); } catch (e) { - filePath = resolve(extDir, url.replace(/^[/\\]/, '')); + filePath = resolve(extDir, requestUrl.replace(/^[/\\]/, '')); } await Promise.resolve(); const content = readFileSync(filePath, {encoding: null}); - return { - ok: true, + return new Response(content, { status: 200, statusText: 'OK', - text: async () => content.toString('utf8'), - json: async () => parseJson(content.toString('utf8')), - }; + headers: {'Content-Type': 
getContentType(filePath)}, + }); } diff --git a/test/term-content-opfs-store.test.js b/test/term-content-opfs-store.test.js new file mode 100644 index 0000000000..cae08be735 --- /dev/null +++ b/test/term-content-opfs-store.test.js @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2023-2025 Yomitan Authors + * Copyright (C) 2020-2022 Yomichan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +import {describe, expect, test, vi} from 'vitest'; +import {TermContentOpfsStore} from '../ext/js/dictionary/term-content-opfs-store.js'; + +/** + * @param {string} message + * @returns {Error} + */ +function createNotReadableError(message = 'The requested file could not be read') { + const error = new Error(message); + error.name = 'NotReadableError'; + return error; +} + +/** + * @param {Uint8Array} bytes + * @returns {{size: number, slice: (start: number, end: number) => {arrayBuffer: () => Promise}}} + */ +function createReadableFile(bytes) { + return { + size: bytes.byteLength, + slice(start, end) { + const clampedStart = Math.max(0, start); + const clampedEnd = Math.max(clampedStart, end); + const page = bytes.slice(clampedStart, clampedEnd); + return { + async arrayBuffer() { + return page.buffer; + }, + }; + }, + }; +} + +describe('TermContentOpfsStore', () => { + test('readSlice recovers after transient NotReadableError and returns bytes', async () => { + const bytes = new Uint8Array([11, 12, 
13, 14, 15, 16]); + const store = new TermContentOpfsStore(); + const readableFile = createReadableFile(bytes); + const fileHandle = { + getFile: vi.fn(async () => readableFile), + }; + const unreadableFile = { + size: bytes.byteLength, + slice() { + return { + async arrayBuffer() { + throw createNotReadableError(); + }, + }; + }, + }; + Reflect.set(store, '_fileHandle', fileHandle); + Reflect.set(store, '_readFile', unreadableFile); + Reflect.set(store, '_loadedForRead', true); + Reflect.set(store, '_length', bytes.byteLength); + + const result = await store.readSlice(1, 4); + + expect(result).toStrictEqual(new Uint8Array([12, 13, 14, 15])); + expect(fileHandle.getFile).toHaveBeenCalledTimes(1); + }); + + test('readSlice returns null (without throwing) when NotReadableError is persistent', async () => { + const bytes = new Uint8Array([1, 2, 3, 4]); + const store = new TermContentOpfsStore(); + const fileHandle = { + getFile: vi.fn(async () => { + throw createNotReadableError('still unreadable'); + }), + }; + const unreadableFile = { + size: bytes.byteLength, + slice() { + return { + async arrayBuffer() { + throw createNotReadableError('unreadable on slice'); + }, + }; + }, + }; + Reflect.set(store, '_fileHandle', fileHandle); + Reflect.set(store, '_readFile', unreadableFile); + Reflect.set(store, '_loadedForRead', true); + Reflect.set(store, '_length', bytes.byteLength); + + const result = await store.readSlice(0, bytes.byteLength); + + expect(result).toBeNull(); + expect(fileHandle.getFile).toHaveBeenCalledTimes(2); + }); +}); diff --git a/test/utilities/database.js b/test/utilities/database.js index 4fcbd72138..eda71be01a 100644 --- a/test/utilities/database.js +++ b/test/utilities/database.js @@ -20,13 +20,15 @@ import {vi} from 'vitest'; * */ export function setupStubs() { + vi.stubGlobal('yomitanRequireOpfs', false); vi.stubGlobal('self', { constructor: { name: 'Window', }, }); - // eslint-disable-next-line jsdoc/require-jsdoc + + /** @returns 
{{addEventListener: () => void, terminate: () => void}} */ function Worker() { return { addEventListener: () => {}, diff --git a/types/ext/api.d.ts b/types/ext/api.d.ts index 696980609d..3abe9a8d35 100644 --- a/types/ext/api.d.ts +++ b/types/ext/api.d.ts @@ -291,6 +291,16 @@ type ApiSurface = { params: void; return: DictionaryImporter.Summary[]; }; + exportDictionaryDatabase: { + params: void; + return: ArrayBuffer; + }; + importDictionaryDatabase: { + params: { + content: ArrayBuffer; + }; + return: void; + }; purgeDatabase: { params: void; return: void; diff --git a/types/ext/dictionary-database.d.ts b/types/ext/dictionary-database.d.ts index 9fe4918f0a..3615955460 100644 --- a/types/ext/dictionary-database.d.ts +++ b/types/ext/dictionary-database.d.ts @@ -53,6 +53,22 @@ export type DatabaseTermEntry = { rules: string; score: number; glossary: DictionaryData.TermGlossary[]; + /** Pre-serialized glossary JSON for import fast-path. */ + glossaryJson?: string; + /** Pre-serialized dedupe payload JSON for import fast-path. */ + termEntryContentJson?: string; + /** Precomputed dedupe payload hash for import fast-path. */ + termEntryContentHash?: string; + /** Precomputed dedupe payload hash high 32 bits for import fast-path. */ + termEntryContentHash1?: number; + /** Precomputed dedupe payload hash low 32 bits for import fast-path. */ + termEntryContentHash2?: number; + /** Pre-encoded dedupe payload bytes for import fast-path. */ + termEntryContentBytes?: Uint8Array; + /** Explicit raw term-content storage dict name for import fast-path. */ + termEntryContentDictName?: string; + /** Raw glossary JSON bytes retained for lazy raw-content encoding. 
*/ + termEntryContentRawGlossaryJsonBytes?: Uint8Array; sequence?: number; termTags?: string; dictionary: string; @@ -213,8 +229,8 @@ export type ObjectStoreData = ( never ); -export type DatabaseUpdateItem = { - primaryKey: IDBValidKey; +export type DatabaseUpdateItem = { + primaryKey: number; data: ObjectStoreData; }; @@ -270,15 +286,20 @@ export type FindMultiBulkData = { indexIndex: number; }; -export type CreateQuery = (item: TItem) => (IDBValidKey | IDBKeyRange | null); +export type CreateQuery = (item: TItem) => (string | number | null); export type FindPredicate = (row: TRow, item: TItem) => boolean; export type CreateResult = (row: TRow, data: FindMultiBulkData) => TResult; -export type DictionarySet = { - has(value: string): boolean; -}; +export type DictionarySet = + Set | + Map | + { + has(value: string): boolean; + readonly size: number; + [Symbol.iterator](): Iterator; + }; /** API for communicating with its own worker */ diff --git a/types/ext/dictionary-importer.d.ts b/types/ext/dictionary-importer.d.ts index 95fc9d1676..6c00543555 100644 --- a/types/ext/dictionary-importer.d.ts +++ b/types/ext/dictionary-importer.d.ts @@ -36,11 +36,33 @@ export type ProgressData = { export type ImportResult = { result: Summary | null; errors: Error[]; + fallbackDatabaseContentBase64?: string | null; + debug?: ImportDebug; +}; + +export type ImportDebug = { + phaseTimings: ImportPhaseTiming[]; +}; + +export type ImportPhaseTiming = { + phase: string; + elapsedMs: number; + details?: Record; }; export type ImportDetails = { prefixWildcardsSupported: boolean; yomitanVersion: string; + existingDatabaseContentBase64?: string; + useImportSession?: boolean; + finalizeImportSession?: boolean; + forceMemoryOnly?: boolean; + skipImageMetadata?: boolean; + skipMediaImport?: boolean; + mediaResolutionConcurrency?: number; + debugImportLogging?: boolean; + enableTermEntryContentDedup?: boolean; + termContentStorageMode?: 'baseline' | 'raw-bytes'; }; export type Summary = { diff 
--git a/types/ext/offscreen.d.ts b/types/ext/offscreen.d.ts index a6a5d8f39f..0914298855 100644 --- a/types/ext/offscreen.d.ts +++ b/types/ext/offscreen.d.ts @@ -39,6 +39,16 @@ type ApiSurface = { params: void; return: DictionaryImporter.Summary[]; }; + exportDictionaryDatabaseOffscreen: { + params: void; + return: string; + }; + importDictionaryDatabaseOffscreen: { + params: { + content: string; + }; + return: void; + }; databasePurgeOffscreen: { params: void; return: boolean; From 197ab496375c92ef077b59da05e8fe3a69c56a55 Mon Sep 17 00:00:00 2001 From: Autumn Skerritt Date: Fri, 13 Mar 2026 11:50:30 +0000 Subject: [PATCH 2/3] Migrate legacy IndexedDB dictionaries to SQLite on first startup --- ext/js/dictionary/dictionary-database.js | 252 +++++++++++++++++- test/database.test.js | 326 +++++++++++++++++++++++ 2 files changed, 577 insertions(+), 1 deletion(-) diff --git a/ext/js/dictionary/dictionary-database.js b/ext/js/dictionary/dictionary-database.js index ca861604fd..406059d1d4 100644 --- a/ext/js/dictionary/dictionary-database.js +++ b/ext/js/dictionary/dictionary-database.js @@ -63,6 +63,25 @@ const HIGH_MEMORY_TERM_BULK_ADD_STAGING_MAX_ROWS = 10240; const TERM_CONTENT_STORAGE_MODE_BASELINE = 'baseline'; const TERM_CONTENT_STORAGE_MODE_RAW_BYTES = 'raw-bytes'; const DEFAULT_RAW_TERM_CONTENT_PACK_TARGET_BYTES = 4 * 1024 * 1024; +const LEGACY_DICTIONARY_INDEXEDDB_NAME = 'dict'; +const LEGACY_DICTIONARY_INDEXEDDB_STORES = /** @type {const} */ ([ + 'dictionaries', + 'terms', + 'termMeta', + 'kanji', + 'kanjiMeta', + 'tagMeta', + 'media', +]); +const LEGACY_DICTIONARY_INDEXEDDB_BATCH_SIZE = { + dictionaries: 128, + terms: 2048, + termMeta: 2048, + kanji: 1024, + kanjiMeta: 2048, + tagMeta: 1024, + media: 64, +}; /** * @param {string} value @@ -268,6 +287,7 @@ export class DictionaryDatabase { await this._openConnection(); await initializeTermContentZstd(); this._termContentZstdInitialized = true; + await this._migrateLegacyIndexedDbIfNeeded(); await 
this._deleteLegacyIndexedDb(); await this._cleanupIncompleteImports(); await this._cleanupMissingTermRecordShards(); @@ -3903,6 +3923,236 @@ export class DictionaryDatabase { `); } + /** + * Migrates dictionaries from the legacy IndexedDB database on first SQLite startup. + * @returns {Promise} + */ + async _migrateLegacyIndexedDbIfNeeded() { + if (typeof indexedDB === 'undefined') { + return; + } + if (!this._isLegacyIndexedDbMigrationNeeded()) { + reportDiagnostics('dictionary-legacy-indexeddb-migration-skipped', { + reason: 'sqlite-not-empty', + }); + return; + } + + const legacyDb = await this._openLegacyIndexedDbIfPresent(); + if (legacyDb === null) { + reportDiagnostics('dictionary-legacy-indexeddb-migration-skipped', { + reason: 'legacy-database-missing', + }); + return; + } + + let bulkImportStarted = false; + /** @type {Record} */ + const migratedRowsByStore = {}; + try { + const dictionariesPreview = await this._readLegacyIndexedDbStoreBatch(legacyDb, 'dictionaries', null, 1); + if (dictionariesPreview.rows.length === 0) { + reportDiagnostics('dictionary-legacy-indexeddb-migration-skipped', { + reason: 'legacy-database-empty', + }); + return; + } + + await this.startBulkImport(); + bulkImportStarted = true; + + let totalRows = 0; + for (const storeName of LEGACY_DICTIONARY_INDEXEDDB_STORES) { + if (!legacyDb.objectStoreNames.contains(storeName)) { + continue; + } + let migratedRowCount = 0; + const batchSize = LEGACY_DICTIONARY_INDEXEDDB_BATCH_SIZE[storeName]; + await this._forEachLegacyIndexedDbStoreBatch(legacyDb, storeName, batchSize, async (rows) => { + const migratedRows = (storeName === 'terms') ? 
+ rows.map((row) => this._normalizeLegacyIndexedDbTermRow(row)) : + rows; + await this.bulkAdd(storeName, migratedRows, 0, migratedRows.length); + migratedRowCount += migratedRows.length; + totalRows += migratedRows.length; + }); + migratedRowsByStore[storeName] = migratedRowCount; + } + + await this.finishBulkImport(); + bulkImportStarted = false; + reportDiagnostics('dictionary-legacy-indexeddb-migration-summary', { + status: 'migrated', + totalRows, + migratedRowsByStore, + usedFallbackStorage: this._usesFallbackStorage, + }); + } catch (e) { + const error = toError(e); + if (bulkImportStarted) { + try { + await this.finishBulkImport(); + } catch (_) { + // Ignore finalization failures; cleanup below resets the partial migration. + } + } + try { + await this._wipeDictionaryDataForSchemaMigration('rollback-legacy-indexeddb-migration'); + } catch (cleanupError) { + log.error(cleanupError); + } + reportDiagnostics('dictionary-legacy-indexeddb-migration-summary', { + status: 'failed', + migratedRowsByStore, + error: error.message, + }); + throw new Error(`Failed to migrate legacy IndexedDB dictionaries: ${error.message}`); + } finally { + legacyDb.close(); + } + } + + /** + * @returns {boolean} + */ + _isLegacyIndexedDbMigrationNeeded() { + if (!this._termRecordStore.isEmpty()) { + return false; + } + const db = this._requireDb(); + const hasSqliteRows = this._asNumber( + db.selectValue(` + SELECT + EXISTS(SELECT 1 FROM dictionaries LIMIT 1) OR + EXISTS(SELECT 1 FROM termEntryContent LIMIT 1) OR + EXISTS(SELECT 1 FROM termMeta LIMIT 1) OR + EXISTS(SELECT 1 FROM kanji LIMIT 1) OR + EXISTS(SELECT 1 FROM kanjiMeta LIMIT 1) OR + EXISTS(SELECT 1 FROM tagMeta LIMIT 1) OR + EXISTS(SELECT 1 FROM media LIMIT 1) OR + EXISTS(SELECT 1 FROM sharedGlossaryArtifacts LIMIT 1) + `), + 0, + ) === 1; + return !hasSqliteRows; + } + + /** + * @returns {Promise} + */ + async _openLegacyIndexedDbIfPresent() { + return await new Promise((resolve, reject) => { + let createdNewDatabase = 
false; + const request = indexedDB.open(LEGACY_DICTIONARY_INDEXEDDB_NAME); + request.onupgradeneeded = (event) => { + createdNewDatabase = event.oldVersion === 0; + }; + request.onerror = () => { + reject(request.error ?? new Error(`Failed to open legacy IndexedDB database '${LEGACY_DICTIONARY_INDEXEDDB_NAME}'`)); + }; + request.onsuccess = () => { + const db = request.result; + if (!createdNewDatabase && db.objectStoreNames.length > 0) { + resolve(db); + return; + } + db.close(); + void this._deleteLegacyIndexedDb() + .then(() => resolve(null)) + .catch(() => resolve(null)); + }; + }); + } + + /** + * @param {IDBDatabase} db + * @param {string} storeName + * @param {IDBValidKey|null} lowerBoundExclusive + * @param {number} batchSize + * @returns {Promise<{rows: unknown[], lastKey: IDBValidKey|null}>} + */ + async _readLegacyIndexedDbStoreBatch(db, storeName, lowerBoundExclusive, batchSize) { + return await new Promise((resolve, reject) => { + const transaction = db.transaction([storeName], 'readonly'); + const objectStore = transaction.objectStore(storeName); + const range = lowerBoundExclusive === null ? null : IDBKeyRange.lowerBound(lowerBoundExclusive, true); + const request = objectStore.openCursor(range, 'next'); + /** @type {unknown[]} */ + const rows = []; + /** @type {IDBValidKey|null} */ + let lastKey = null; + + transaction.onabort = () => { + reject(transaction.error ?? new Error(`Legacy IndexedDB transaction aborted for store '${storeName}'`)); + }; + request.onerror = () => { + reject(request.error ?? 
new Error(`Failed to read legacy IndexedDB store '${storeName}'`)); + }; + request.onsuccess = () => { + const cursor = request.result; + if (cursor !== null && rows.length < batchSize) { + rows.push(cursor.value); + lastKey = cursor.primaryKey; + cursor.continue(); + return; + } + resolve({rows, lastKey}); + }; + }); + } + + /** + * @param {IDBDatabase} db + * @param {string} storeName + * @param {number} batchSize + * @param {(rows: unknown[]) => Promise} onBatch + * @returns {Promise} + */ + async _forEachLegacyIndexedDbStoreBatch(db, storeName, batchSize, onBatch) { + /** @type {IDBValidKey|null} */ + let lowerBoundExclusive = null; + while (true) { + const {rows, lastKey} = await this._readLegacyIndexedDbStoreBatch(db, storeName, lowerBoundExclusive, batchSize); + if (rows.length === 0) { + return; + } + await onBatch(rows); + if (lastKey === null || rows.length < batchSize) { + return; + } + lowerBoundExclusive = lastKey; + } + } + + /** + * @param {unknown} row + * @returns {import('dictionary-database').DatabaseTermEntry} + */ + _normalizeLegacyIndexedDbTermRow(row) { + /** @type {import('dictionary-database').DatabaseTermEntry} */ + const migratedRow = { + .../** @type {import('core').SafeAny} */ (row), + }; + const expression = this._asString(migratedRow.expression); + const reading = this._asString(migratedRow.reading); + if (typeof migratedRow.expressionReverse !== 'string') { + migratedRow.expressionReverse = stringReverse(expression); + } + if (typeof migratedRow.readingReverse !== 'string') { + migratedRow.readingReverse = stringReverse(reading); + } + if (typeof migratedRow.rules !== 'string') { + migratedRow.rules = ''; + } + if (typeof migratedRow.definitionTags !== 'string') { + migratedRow.definitionTags = typeof migratedRow.tags === 'string' ? 
migratedRow.tags : ''; + } + if (typeof migratedRow.termTags !== 'string') { + migratedRow.termTags = ''; + } + return migratedRow; + } + /** * Best effort cleanup for old IndexedDB storage from pre-sqlite builds. */ @@ -3912,7 +4162,7 @@ export class DictionaryDatabase { } await new Promise((resolve) => { try { - const request = indexedDB.deleteDatabase('dict'); + const request = indexedDB.deleteDatabase(LEGACY_DICTIONARY_INDEXEDDB_NAME); request.onsuccess = () => resolve(void 0); request.onerror = () => resolve(void 0); request.onblocked = () => resolve(void 0); diff --git a/test/database.test.js b/test/database.test.js index ed43a70a8e..76f2565953 100644 --- a/test/database.test.js +++ b/test/database.test.js @@ -24,6 +24,7 @@ import {BlobWriter, TextReader, ZipWriter} from '@zip.js/zip.js'; import {beforeEach, describe, test, vi} from 'vitest'; import {createDictionaryArchiveData, getDictionaryArchiveIndex} from '../dev/dictionary-archive-util.js'; import {parseJson} from '../dev/json.js'; +import {Database} from '../ext/js/data/database.js'; import {DictionaryDatabase} from '../ext/js/dictionary/dictionary-database.js'; import {DictionaryImporter} from '../ext/js/dictionary/dictionary-importer.js'; import {encodeRawTermContentSharedGlossaryBinary} from '../ext/js/dictionary/raw-term-content.js'; @@ -43,6 +44,75 @@ vi.stubGlobal('IDBKeyRange', IDBKeyRange); vi.stubGlobal('fetch', fetch); vi.stubGlobal('chrome', chrome); +const LEGACY_INDEXEDDB_NAME = 'dict'; +const LEGACY_INDEXEDDB_STRUCTURE = [ + { + version: 20, + stores: { + terms: { + primaryKey: {keyPath: 'id', autoIncrement: true}, + indices: ['dictionary', 'expression', 'reading'], + }, + kanji: { + primaryKey: {autoIncrement: true}, + indices: ['dictionary', 'character'], + }, + tagMeta: { + primaryKey: {autoIncrement: true}, + indices: ['dictionary'], + }, + dictionaries: { + primaryKey: {autoIncrement: true}, + indices: ['title', 'version'], + }, + }, + }, + { + version: 30, + stores: { + termMeta: { 
+ primaryKey: {autoIncrement: true}, + indices: ['dictionary', 'expression'], + }, + kanjiMeta: { + primaryKey: {autoIncrement: true}, + indices: ['dictionary', 'character'], + }, + tagMeta: { + primaryKey: {autoIncrement: true}, + indices: ['dictionary', 'name'], + }, + }, + }, + { + version: 40, + stores: { + terms: { + primaryKey: {keyPath: 'id', autoIncrement: true}, + indices: ['dictionary', 'expression', 'reading', 'sequence'], + }, + }, + }, + { + version: 50, + stores: { + terms: { + primaryKey: {keyPath: 'id', autoIncrement: true}, + indices: ['dictionary', 'expression', 'reading', 'sequence', 'expressionReverse', 'readingReverse'], + }, + }, + }, + { + version: 60, + stores: { + media: { + primaryKey: {keyPath: 'id', autoIncrement: true}, + indices: ['dictionary', 'path'], + }, + }, + }, +]; + /** * @returns {{ * kind: 'directory', @@ -222,6 +292,169 @@ function concatUint8Arrays(chunks) { return result; } +/** + * @param {unknown} value + * @returns {ArrayBuffer} + */ +function cloneArrayBuffer(value) { + if (value instanceof ArrayBuffer) { + return value.slice(0); + } + if (ArrayBuffer.isView(value)) { + const typedArray = /** @type {ArrayBufferView} */ (value); + return typedArray.buffer.slice(typedArray.byteOffset, typedArray.byteOffset + typedArray.byteLength); + } + return new ArrayBuffer(0); +} + +/** + * @param {DictionaryDatabase} dictionaryDatabase + * @returns {Promise<{ + * dictionaries: import('dictionary-importer').Summary[], + * terms: import('dictionary-database').DatabaseTermEntry[], + * termMeta: import('dictionary-database').DatabaseTermMeta[], + * kanji: import('dictionary-database').DatabaseKanjiEntry[], + * kanjiMeta: import('dictionary-database').DatabaseKanjiMeta[], + * tagMeta: import('dictionary-database').Tag[], + * media: import('dictionary-database').MediaDataArrayBufferContent[], + * }>} + */ +async function createLegacyIndexedDbSnapshot(dictionaryDatabase) { + const db = getRequiredDb(dictionaryDatabase); + const 
dictionaries = await dictionaryDatabase.getDictionaryInfo(); + const terms = db.selectObjects(` + SELECT + dictionary, + expression, + reading, + expressionReverse, + readingReverse, + definitionTags, + termTags, + rules, + score, + glossaryJson, + sequence + FROM terms + ORDER BY rowid ASC + `).map((row) => ({ + dictionary: row.dictionary, + expression: row.expression, + reading: row.reading, + expressionReverse: row.expressionReverse, + readingReverse: row.readingReverse, + definitionTags: row.definitionTags, + termTags: row.termTags, + rules: row.rules, + score: row.score, + glossary: /** @type {import('dictionary-data').TermGlossary[]} */ (parseJson(row.glossaryJson)), + sequence: row.sequence, + })); + const termMeta = db.selectObjects('SELECT dictionary, expression, mode, dataJson FROM termMeta ORDER BY id ASC') + .map((row) => ({ + dictionary: row.dictionary, + expression: row.expression, + mode: row.mode, + data: parseJson(row.dataJson), + })); + const kanji = db.selectObjects('SELECT dictionary, character, onyomi, kunyomi, tags, meaningsJson, statsJson FROM kanji ORDER BY id ASC') + .map((row) => ({ + dictionary: row.dictionary, + character: row.character, + onyomi: row.onyomi, + kunyomi: row.kunyomi, + tags: row.tags, + meanings: /** @type {string[]} */ (parseJson(row.meaningsJson)), + stats: typeof row.statsJson === 'string' ? 
/** @type {{[name: string]: string}} */ (parseJson(row.statsJson)) : void 0, + })); + const kanjiMeta = db.selectObjects('SELECT dictionary, character, mode, dataJson FROM kanjiMeta ORDER BY id ASC') + .map((row) => ({ + dictionary: row.dictionary, + character: row.character, + mode: row.mode, + data: parseJson(row.dataJson), + })); + const tagMeta = db.selectObjects('SELECT dictionary, name, category, ord, notes, score FROM tagMeta ORDER BY id ASC') + .map((row) => ({ + dictionary: row.dictionary, + name: row.name, + category: row.category, + order: row.ord, + notes: row.notes, + score: row.score, + })); + const media = db.selectObjects('SELECT dictionary, path, mediaType, width, height, content FROM media ORDER BY id ASC') + .map((row) => ({ + dictionary: row.dictionary, + path: row.path, + mediaType: row.mediaType, + width: row.width, + height: row.height, + content: cloneArrayBuffer(row.content), + })); + return { + dictionaries, + terms, + termMeta, + kanji, + kanjiMeta, + tagMeta, + media, + }; +} + +/** + * @param {Awaited>} snapshot + * @returns {Promise} + */ +async function populateLegacyIndexedDb(snapshot) { + const legacyDatabase = new Database(); + await legacyDatabase.open(LEGACY_INDEXEDDB_NAME, 60, LEGACY_INDEXEDDB_STRUCTURE); + try { + for (const [storeName, rows] of Object.entries(snapshot)) { + if (!Array.isArray(rows) || rows.length === 0) { + continue; + } + await legacyDatabase.bulkAdd( + /** @type {import('dictionary-database').ObjectStoreName} */ (storeName), + rows, + 0, + rows.length, + ); + } + } finally { + legacyDatabase.close(); + } +} + +/** + * @returns {Promise} + */ +async function openLegacyIndexedDbIfPresent() { + return await new Promise((resolve, reject) => { + let createdNewDatabase = false; + const request = indexedDB.open(LEGACY_INDEXEDDB_NAME); + request.onupgradeneeded = (event) => { + createdNewDatabase = event.oldVersion === 0; + }; + request.onerror = () => { + reject(request.error ?? 
new Error(`Failed to open IndexedDB database '${LEGACY_INDEXEDDB_NAME}'`)); + }; + request.onsuccess = () => { + const db = request.result; + if (!createdNewDatabase && db.objectStoreNames.length > 0) { + resolve(db); + return; + } + db.close(); + const deleteRequest = indexedDB.deleteDatabase(LEGACY_INDEXEDDB_NAME); + deleteRequest.onsuccess = () => resolve(null); + deleteRequest.onerror = () => resolve(null); + deleteRequest.onblocked = () => resolve(null); + }; + }); +} + /** * @param {DictionaryDatabase} dictionaryDatabase * @returns {import('@sqlite.org/sqlite-wasm').Database} @@ -1551,6 +1784,99 @@ describe('Database', () => { } }); + test('Migrates dictionaries from legacy IndexedDB into SQLite on first startup', async ({expect}) => { + const opfsRootDirectoryHandle = createInMemoryOpfsDirectoryHandle(); + const restoreNavigator = installInMemoryOpfsNavigator(opfsRootDirectoryHandle); + try { + const testDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1', 'Legacy Dictionary'); + const importDetails = {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}; + + const sourceDictionaryDatabase = new DictionaryDatabase(); + await sourceDictionaryDatabase.prepare(); + await createDictionaryImporter(expect).importDictionary(sourceDictionaryDatabase, testDictionarySource, importDetails); + + const expectedInfo = await sourceDictionaryDatabase.getDictionaryInfo(); + const expectedCounts = await sourceDictionaryDatabase.getDictionaryCounts(['Legacy Dictionary'], true); + const legacySnapshot = await createLegacyIndexedDbSnapshot(sourceDictionaryDatabase); + await sourceDictionaryDatabase.purge(); + await sourceDictionaryDatabase.close(); + + await populateLegacyIndexedDb(legacySnapshot); + const legacyDbBeforeMigration = await openLegacyIndexedDbIfPresent(); + expect.soft(legacyDbBeforeMigration).not.toBeNull(); + legacyDbBeforeMigration?.close(); + + const migratedDictionaryDatabase = new DictionaryDatabase(); + await 
migratedDictionaryDatabase.prepare(); + try { + const migratedInfo = await migratedDictionaryDatabase.getDictionaryInfo(); + const migratedCounts = await migratedDictionaryDatabase.getDictionaryCounts(['Legacy Dictionary'], true); + expect.soft(migratedInfo).toStrictEqual(expectedInfo); + expect.soft(migratedCounts).toStrictEqual(expectedCounts); + + const legacyDbAfterMigration = await openLegacyIndexedDbIfPresent(); + expect.soft(legacyDbAfterMigration).toBeNull(); + legacyDbAfterMigration?.close(); + } finally { + await migratedDictionaryDatabase.close(); + } + } finally { + restoreNavigator(); + } + }, 15000); + + test('Skips legacy IndexedDB migration when SQLite already has dictionaries', async ({expect}) => { + const opfsRootDirectoryHandle = createInMemoryOpfsDirectoryHandle(); + const restoreNavigator = installInMemoryOpfsNavigator(opfsRootDirectoryHandle); + try { + const legacyDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1', 'Legacy Dictionary'); + const currentDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1', 'Current Dictionary'); + const importDetails = {prefixWildcardsSupported: true, yomitanVersion: '0.0.0.0'}; + + const legacySourceDatabase = new DictionaryDatabase(); + await legacySourceDatabase.prepare(); + await createDictionaryImporter(expect).importDictionary(legacySourceDatabase, legacyDictionarySource, importDetails); + const legacySnapshot = await createLegacyIndexedDbSnapshot(legacySourceDatabase); + await legacySourceDatabase.purge(); + await legacySourceDatabase.close(); + + const currentDictionaryDatabase = new DictionaryDatabase(); + await currentDictionaryDatabase.prepare(); + await createDictionaryImporter(expect).importDictionary(currentDictionaryDatabase, currentDictionarySource, importDetails); + const expectedInfo = await currentDictionaryDatabase.getDictionaryInfo(); + const expectedCounts = await currentDictionaryDatabase.getDictionaryCounts(['Current Dictionary'], true); 
+ + await populateLegacyIndexedDb(legacySnapshot); + try { + const migrateLegacyIndexedDbIfNeeded = Reflect.get(currentDictionaryDatabase, '_migrateLegacyIndexedDbIfNeeded'); + if (typeof migrateLegacyIndexedDbIfNeeded !== 'function') { + throw new Error('Expected _migrateLegacyIndexedDbIfNeeded method'); + } + await Promise.resolve(migrateLegacyIndexedDbIfNeeded.call(currentDictionaryDatabase)); + + const deleteLegacyIndexedDb = Reflect.get(currentDictionaryDatabase, '_deleteLegacyIndexedDb'); + if (typeof deleteLegacyIndexedDb !== 'function') { + throw new Error('Expected _deleteLegacyIndexedDb method'); + } + await Promise.resolve(deleteLegacyIndexedDb.call(currentDictionaryDatabase)); + + const info = await currentDictionaryDatabase.getDictionaryInfo(); + const counts = await currentDictionaryDatabase.getDictionaryCounts(['Current Dictionary'], true); + expect.soft(info).toStrictEqual(expectedInfo); + expect.soft(info.map((item) => item.title)).toStrictEqual(['Current Dictionary']); + expect.soft(counts).toStrictEqual(expectedCounts); + + const legacyDbAfterPrepare = await openLegacyIndexedDbIfPresent(); + expect.soft(legacyDbAfterPrepare).toBeNull(); + legacyDbAfterPrepare?.close(); + } finally { + await currentDictionaryDatabase.close(); + } + } finally { + restoreNavigator(); + } + }, 15000); + test('Schema migration v1 wipes unversioned dictionary data and advances to current schema version', async ({expect}) => { const testDictionarySource = await createTestDictionaryArchiveData('valid-dictionary1'); const dictionaryDatabase = new DictionaryDatabase(); From 9d428938d2e9a212e48afa0e5c26d281714c4f28 Mon Sep 17 00:00:00 2001 From: Autumn Skerritt Date: Fri, 13 Mar 2026 12:10:40 +0000 Subject: [PATCH 3/3] Move legacy IndexedDB migration behind experimental settings action --- ext/js/background/backend.js | 16 +++ ext/js/background/offscreen-proxy.js | 14 ++ ext/js/background/offscreen.js | 12 ++ ext/js/comm/api.js | 14 ++ 
ext/js/dictionary/dictionary-database.js | 158 ++++++++++++++++----- ext/js/pages/settings/backup-controller.js | 152 ++++++++++++++++++-- ext/settings.html | 22 +++ test/database.test.js | 58 +++++--- types/ext/api.d.ts | 31 ++++ types/ext/backend.d.ts | 2 +- types/ext/offscreen.d.ts | 9 ++ 11 files changed, 426 insertions(+), 62 deletions(-) diff --git a/ext/js/background/backend.js b/ext/js/background/backend.js index 9c603bb564..e59fbd663b 100644 --- a/ext/js/background/backend.js +++ b/ext/js/background/backend.js @@ -170,6 +170,8 @@ export class Backend { ['getZoom', this._onApiGetZoom.bind(this)], ['getDefaultAnkiFieldTemplates', this._onApiGetDefaultAnkiFieldTemplates.bind(this)], ['getDictionaryInfo', this._onApiGetDictionaryInfo.bind(this)], + ['getLegacyIndexedDbMigrationStatus', this._onApiGetLegacyIndexedDbMigrationStatus.bind(this)], + ['migrateLegacyIndexedDb', this._onApiMigrateLegacyIndexedDb.bind(this)], ['exportDictionaryDatabase', this._onApiExportDictionaryDatabase.bind(this)], ['importDictionaryDatabase', this._onApiImportDictionaryDatabase.bind(this)], ['purgeDatabase', this._onApiPurgeDatabase.bind(this)], @@ -929,6 +931,20 @@ export class Backend { return await this._dictionaryDatabase.getDictionaryInfo(); } + /** @type {import('api').ApiHandler<'getLegacyIndexedDbMigrationStatus'>} */ + async _onApiGetLegacyIndexedDbMigrationStatus() { + return await this._dictionaryDatabase.getLegacyIndexedDbMigrationStatus(); + } + + /** @type {import('api').ApiHandler<'migrateLegacyIndexedDb'>} */ + async _onApiMigrateLegacyIndexedDb() { + const result = await this._dictionaryDatabase.migrateLegacyIndexedDb(); + if (result.result === 'migrated') { + this._triggerDatabaseUpdated('dictionary', 'migrate'); + } + return result; + } + /** @type {import('api').ApiHandler<'exportDictionaryDatabase'>} */ async _onApiExportDictionaryDatabase() { return await this._dictionaryDatabase.exportDatabase(); diff --git a/ext/js/background/offscreen-proxy.js 
b/ext/js/background/offscreen-proxy.js index 2a689440db..1b7b681677 100644 --- a/ext/js/background/offscreen-proxy.js +++ b/ext/js/background/offscreen-proxy.js @@ -190,6 +190,20 @@ export class DictionaryDatabaseProxy { return this._offscreen.sendMessagePromise({action: 'getDictionaryInfoOffscreen'}); } + /** + * @returns {Promise} + */ + async getLegacyIndexedDbMigrationStatus() { + return this._offscreen.sendMessagePromise({action: 'getLegacyIndexedDbMigrationStatusOffscreen'}); + } + + /** + * @returns {Promise} + */ + async migrateLegacyIndexedDb() { + return this._offscreen.sendMessagePromise({action: 'migrateLegacyIndexedDbOffscreen'}); + } + /** * @returns {Promise} */ diff --git a/ext/js/background/offscreen.js b/ext/js/background/offscreen.js index 4e535069ea..a60d555987 100644 --- a/ext/js/background/offscreen.js +++ b/ext/js/background/offscreen.js @@ -55,6 +55,8 @@ export class Offscreen { ['clipboardSetBrowserOffscreen', this._setClipboardBrowser.bind(this)], ['databasePrepareOffscreen', this._prepareDatabaseHandler.bind(this)], ['getDictionaryInfoOffscreen', this._getDictionaryInfoHandler.bind(this)], + ['getLegacyIndexedDbMigrationStatusOffscreen', this._getLegacyIndexedDbMigrationStatusHandler.bind(this)], + ['migrateLegacyIndexedDbOffscreen', this._migrateLegacyIndexedDbHandler.bind(this)], ['exportDictionaryDatabaseOffscreen', this._exportDictionaryDatabaseHandler.bind(this)], ['importDictionaryDatabaseOffscreen', this._importDictionaryDatabaseHandler.bind(this)], ['databasePurgeOffscreen', this._purgeDatabaseHandler.bind(this)], @@ -119,6 +121,16 @@ export class Offscreen { return await this._dictionaryDatabase.getDictionaryInfo(); } + /** @type {import('offscreen').ApiHandler<'getLegacyIndexedDbMigrationStatusOffscreen'>} */ + async _getLegacyIndexedDbMigrationStatusHandler() { + return await this._dictionaryDatabase.getLegacyIndexedDbMigrationStatus(); + } + + /** @type {import('offscreen').ApiHandler<'migrateLegacyIndexedDbOffscreen'>} */ + 
async _migrateLegacyIndexedDbHandler() { + return await this._dictionaryDatabase.migrateLegacyIndexedDb(); + } + /** @type {import('offscreen').ApiHandler<'exportDictionaryDatabaseOffscreen'>} */ async _exportDictionaryDatabaseHandler() { return arrayBufferToBase64(await this._dictionaryDatabase.exportDatabase()); diff --git a/ext/js/comm/api.js b/ext/js/comm/api.js index e7573b802f..408d2eed27 100644 --- a/ext/js/comm/api.js +++ b/ext/js/comm/api.js @@ -248,6 +248,20 @@ export class API { return this._invoke('getDictionaryInfo', void 0); } + /** + * @returns {Promise>} + */ + getLegacyIndexedDbMigrationStatus() { + return this._invoke('getLegacyIndexedDbMigrationStatus', void 0); + } + + /** + * @returns {Promise>} + */ + migrateLegacyIndexedDb() { + return this._invoke('migrateLegacyIndexedDb', void 0); + } + /** * @returns {Promise>} */ diff --git a/ext/js/dictionary/dictionary-database.js b/ext/js/dictionary/dictionary-database.js index 406059d1d4..7da1619784 100644 --- a/ext/js/dictionary/dictionary-database.js +++ b/ext/js/dictionary/dictionary-database.js @@ -287,8 +287,6 @@ export class DictionaryDatabase { await this._openConnection(); await initializeTermContentZstd(); this._termContentZstdInitialized = true; - await this._migrateLegacyIndexedDbIfNeeded(); - await this._deleteLegacyIndexedDb(); await this._cleanupIncompleteImports(); await this._cleanupMissingTermRecordShards(); @@ -690,6 +688,56 @@ export class DictionaryDatabase { return result; } + /** + * @returns {Promise<{ + * reason: 'available'|'indexeddb-unavailable'|'legacy-database-missing'|'legacy-database-empty'|'sqlite-not-empty', + * hasLegacyDatabase: boolean, + * hasLegacyData: boolean, + * sqliteEmpty: boolean, + * migrationAvailable: boolean, + * }>} + */ + async getLegacyIndexedDbMigrationStatus() { + const {legacyDb, status} = await this._getLegacyIndexedDbMigrationStatusDetails(); + legacyDb?.close(); + return status; + } + + /** + * @returns {Promise<{ + * result: 
'migrated'|'skipped', + * reason: null|'indexeddb-unavailable'|'legacy-database-missing'|'legacy-database-empty'|'sqlite-not-empty', + * migratedRowsByStore: Partial>, + * totalRows: number, + * usedFallbackStorage: boolean, + * }>} + */ + async migrateLegacyIndexedDb() { + const {legacyDb, status} = await this._getLegacyIndexedDbMigrationStatusDetails(); + if (legacyDb === null || !status.migrationAvailable) { + legacyDb?.close(); + reportDiagnostics('dictionary-legacy-indexeddb-migration-skipped', { + reason: status.reason, + }); + return { + result: 'skipped', + reason: status.reason, + migratedRowsByStore: {}, + totalRows: 0, + usedFallbackStorage: this._usesFallbackStorage, + }; + } + + let result; + try { + result = await this._migrateLegacyIndexedDb(legacyDb); + } finally { + legacyDb.close(); + } + await this._deleteLegacyIndexedDb(); + return result; + } + /** * @returns {Promise} */ @@ -3924,44 +3972,84 @@ export class DictionaryDatabase { } /** - * Migrates dictionaries from the legacy IndexedDB database on first SQLite startup. 
- * @returns {Promise} + * @returns {Promise<{ + * legacyDb: IDBDatabase|null, + * status: { + * reason: 'available'|'indexeddb-unavailable'|'legacy-database-missing'|'legacy-database-empty'|'sqlite-not-empty', + * hasLegacyDatabase: boolean, + * hasLegacyData: boolean, + * sqliteEmpty: boolean, + * migrationAvailable: boolean, + * }, + * }>} */ - async _migrateLegacyIndexedDbIfNeeded() { + async _getLegacyIndexedDbMigrationStatusDetails() { + const sqliteEmpty = this._isSqliteDictionaryStorageEmpty(); if (typeof indexedDB === 'undefined') { - return; - } - if (!this._isLegacyIndexedDbMigrationNeeded()) { - reportDiagnostics('dictionary-legacy-indexeddb-migration-skipped', { - reason: 'sqlite-not-empty', - }); - return; + return { + legacyDb: null, + status: { + reason: 'indexeddb-unavailable', + hasLegacyDatabase: false, + hasLegacyData: false, + sqliteEmpty, + migrationAvailable: false, + }, + }; } - const legacyDb = await this._openLegacyIndexedDbIfPresent(); - if (legacyDb === null) { - reportDiagnostics('dictionary-legacy-indexeddb-migration-skipped', { - reason: 'legacy-database-missing', - }); - return; + try { + const hasLegacyDatabase = legacyDb !== null; + let hasLegacyData = false; + if (legacyDb !== null && legacyDb.objectStoreNames.contains('dictionaries')) { + const dictionariesPreview = await this._readLegacyIndexedDbStoreBatch(legacyDb, 'dictionaries', null, 1); + hasLegacyData = dictionariesPreview.rows.length > 0; + } + + let reason = 'available'; + if (!hasLegacyDatabase) { + reason = 'legacy-database-missing'; + } else if (!hasLegacyData) { + reason = 'legacy-database-empty'; + } else if (!sqliteEmpty) { + reason = 'sqlite-not-empty'; + } + return { + legacyDb, + status: { + reason, + hasLegacyDatabase, + hasLegacyData, + sqliteEmpty, + migrationAvailable: reason === 'available', + }, + }; + } catch (e) { + legacyDb?.close(); + throw e; } + } + /** + * Migrates dictionaries from the legacy IndexedDB database into SQLite. 
+ * @param {IDBDatabase} legacyDb + * @returns {Promise<{ + * result: 'migrated', + * reason: null, + * migratedRowsByStore: Partial>, + * totalRows: number, + * usedFallbackStorage: boolean, + * }>} + */ + async _migrateLegacyIndexedDb(legacyDb) { let bulkImportStarted = false; - /** @type {Record} */ + /** @type {Partial>} */ const migratedRowsByStore = {}; + let totalRows = 0; try { - const dictionariesPreview = await this._readLegacyIndexedDbStoreBatch(legacyDb, 'dictionaries', null, 1); - if (dictionariesPreview.rows.length === 0) { - reportDiagnostics('dictionary-legacy-indexeddb-migration-skipped', { - reason: 'legacy-database-empty', - }); - return; - } - await this.startBulkImport(); bulkImportStarted = true; - let totalRows = 0; for (const storeName of LEGACY_DICTIONARY_INDEXEDDB_STORES) { if (!legacyDb.objectStoreNames.contains(storeName)) { continue; @@ -3987,6 +4075,13 @@ export class DictionaryDatabase { migratedRowsByStore, usedFallbackStorage: this._usesFallbackStorage, }); + return { + result: 'migrated', + reason: null, + migratedRowsByStore, + totalRows, + usedFallbackStorage: this._usesFallbackStorage, + }; } catch (e) { const error = toError(e); if (bulkImportStarted) { @@ -4007,15 +4102,13 @@ export class DictionaryDatabase { error: error.message, }); throw new Error(`Failed to migrate legacy IndexedDB dictionaries: ${error.message}`); - } finally { - legacyDb.close(); } } /** * @returns {boolean} */ - _isLegacyIndexedDbMigrationNeeded() { + _isSqliteDictionaryStorageEmpty() { if (!this._termRecordStore.isEmpty()) { return false; } @@ -4129,9 +4222,10 @@ export class DictionaryDatabase { * @returns {import('dictionary-database').DatabaseTermEntry} */ _normalizeLegacyIndexedDbTermRow(row) { + const rawRow = /** @type {Record} */ (/** @type {import('core').SafeAny} */ (row)); /** @type {import('dictionary-database').DatabaseTermEntry} */ const migratedRow = { - .../** @type {import('core').SafeAny} */ (row), + ...rawRow, }; const expression = 
this._asString(migratedRow.expression); const reading = this._asString(migratedRow.reading); diff --git a/ext/js/pages/settings/backup-controller.js b/ext/js/pages/settings/backup-controller.js index 54fc069607..33b3a20a1b 100644 --- a/ext/js/pages/settings/backup-controller.js +++ b/ext/js/pages/settings/backup-controller.js @@ -53,7 +53,7 @@ export class BackupController { this._optionsUtil = null; /** @type {?import('core').TokenObject} */ - this._settingsExportDatabaseToken = null; + this._settingsDatabaseOperationToken = null; try { this._optionsUtil = new OptionsUtil(); @@ -86,6 +86,9 @@ export class BackupController { this._addNodeEventListener('#settings-export-db-button', 'click', this._onSettingsExportDatabaseClick.bind(this), false); this._addNodeEventListener('#settings-import-db-button', 'click', this._onSettingsImportDatabaseClick.bind(this), false); this._addNodeEventListener('#settings-import-db', 'change', this._onSettingsImportDatabaseChange.bind(this), false); + this._addNodeEventListener('#settings-migrate-legacy-indexeddb-button', 'click', this._onSettingsMigrateLegacyIndexedDbClick.bind(this), false); + this._settingsController.application.on('databaseUpdated', this._onDatabaseUpdated.bind(this)); + await this._updateLegacyIndexedDbMigrationUi(); } // Private @@ -583,6 +586,75 @@ export class BackupController { messageContainer.textContent = message; } + /** + * @param {import('api').LegacyIndexedDbMigrationReason} reason + * @returns {{message: string, color: string, enabled: boolean}} + */ + _getLegacyIndexedDbMigrationUiState(reason) { + switch (reason) { + case 'available': + return { + message: 'Legacy IndexedDB dictionary data was found and the current SQLite dictionary database is empty. 
You can try the experimental migration.', + color: '#8B6508', + enabled: true, + }; + case 'indexeddb-unavailable': + return { + message: 'IndexedDB is unavailable in this context, so the experimental migration cannot run here.', + color: '#8B0000', + enabled: false, + }; + case 'legacy-database-empty': + return { + message: 'A legacy IndexedDB dictionary database was found, but it does not contain any dictionaries to migrate.', + color: '#666666', + enabled: false, + }; + case 'sqlite-not-empty': + return { + message: 'The current SQLite dictionary database already contains dictionaries. This experimental migration only runs when the SQLite dictionary database is empty.', + color: '#8B0000', + enabled: false, + }; + default: + return { + message: 'No legacy IndexedDB dictionary database was detected.', + color: '#666666', + enabled: false, + }; + } + } + + /** */ + async _updateLegacyIndexedDbMigrationUi() { + const button = document.querySelector('#settings-migrate-legacy-indexeddb-button'); + const statusNode = document.querySelector('#settings-legacy-indexeddb-migration-status'); + if (!(button instanceof HTMLButtonElement) || !(statusNode instanceof HTMLElement)) { + return; + } + + try { + const status = await this._settingsController.application.api.getLegacyIndexedDbMigrationStatus(); + const {message, color, enabled} = this._getLegacyIndexedDbMigrationUiState(status.reason); + statusNode.textContent = message; + statusNode.style.color = color; + button.disabled = !enabled || this._settingsDatabaseOperationToken !== null; + } catch (error) { + log.error(error); + statusNode.textContent = 'Unable to determine whether a legacy IndexedDB dictionary database is available.'; + statusNode.style.color = '#8B0000'; + button.disabled = true; + } + } + + /** */ + _clearDatabaseOperationError() { + const errorMessageContainer = document.querySelector('#db-ops-error-report'); + if (errorMessageContainer instanceof HTMLElement) { + errorMessageContainer.style.display = 
'none'; + } + } + /** * @returns {Promise} */ @@ -592,22 +664,21 @@ export class BackupController { /** */ async _onSettingsExportDatabaseClick() { - if (this._settingsExportDatabaseToken !== null) { + if (this._settingsDatabaseOperationToken !== null) { // An existing import or export is in progress. this._databaseExportImportErrorMessage('An export or import operation is already in progress. Please wait till it is over.', true); return; } - /** @type {HTMLElement} */ - const errorMessageContainer = querySelectorNotNull(document, '#db-ops-error-report'); - errorMessageContainer.style.display = 'none'; + this._clearDatabaseOperationError(); const date = new Date(Date.now()); const pageExitPrevention = this._settingsController.preventPageExit(); try { /** @type {import('core').TokenObject} */ const token = {}; - this._settingsExportDatabaseToken = token; + this._settingsDatabaseOperationToken = token; + await this._updateLegacyIndexedDbMigrationUi(); this._setDatabaseExportImportStatus('Exporting dictionary collection...'); const fileName = `yomitan-dictionaries-${this._getSettingsExportDateString(date, '-', '-', '-', 6)}.sqlite3`; const data = await this._exportDatabase(); @@ -620,7 +691,8 @@ export class BackupController { this._databaseExportImportErrorMessage('Errors encountered while exporting. Please try again. Restart the browser if it continues to fail.'); } finally { pageExitPrevention.end(); - this._settingsExportDatabaseToken = null; + this._settingsDatabaseOperationToken = null; + await this._updateLegacyIndexedDbMigrationUi(); } } @@ -644,15 +716,13 @@ export class BackupController { * @param {Event} e */ async _onSettingsImportDatabaseChange(e) { - if (this._settingsExportDatabaseToken !== null) { + if (this._settingsDatabaseOperationToken !== null) { // An existing import or export is in progress. this._databaseExportImportErrorMessage('An export or import operation is already in progress. 
Please wait till it is over.', true); return; } - /** @type {HTMLElement} */ - const errorMessageContainer = querySelectorNotNull(document, '#db-ops-error-report'); - errorMessageContainer.style.display = 'none'; + this._clearDatabaseOperationError(); const element = /** @type {HTMLInputElement} */ (e.currentTarget); const files = element.files; @@ -664,7 +734,8 @@ export class BackupController { try { /** @type {import('core').TokenObject} */ const token = {}; - this._settingsExportDatabaseToken = token; + this._settingsDatabaseOperationToken = token; + await this._updateLegacyIndexedDbMigrationUi(); this._setDatabaseExportImportStatus('Importing dictionary collection...'); await this._importDatabase(file); this._setDatabaseExportImportStatus( @@ -677,7 +748,62 @@ export class BackupController { this._databaseExportImportErrorMessage('Encountered errors when importing. Please restart the browser and try again. If it continues to fail, reinstall Yomitan and import dictionaries one-by-one.'); } finally { pageExitPrevention.end(); - this._settingsExportDatabaseToken = null; + this._settingsDatabaseOperationToken = null; + await this._updateLegacyIndexedDbMigrationUi(); } } + + /** */ + async _onSettingsMigrateLegacyIndexedDbClick() { + if (this._settingsDatabaseOperationToken !== null) { + this._databaseExportImportErrorMessage('An export, import, or migration operation is already in progress. 
Please wait until it finishes.', true); + return; + } + + this._clearDatabaseOperationError(); + + const pageExitPrevention = this._settingsController.preventPageExit(); + try { + const status = await this._settingsController.application.api.getLegacyIndexedDbMigrationStatus(); + if (!status.migrationAvailable) { + const {message} = this._getLegacyIndexedDbMigrationUiState(status.reason); + this._databaseExportImportErrorMessage(message, true); + await this._updateLegacyIndexedDbMigrationUi(); + return; + } + + /** @type {import('core').TokenObject} */ + const token = {}; + this._settingsDatabaseOperationToken = token; + await this._updateLegacyIndexedDbMigrationUi(); + this._setDatabaseExportImportStatus('Running experimental legacy IndexedDB migration...'); + + const result = await this._settingsController.application.api.migrateLegacyIndexedDb(); + if (result.result !== 'migrated') { + const {message} = this._getLegacyIndexedDbMigrationUiState(result.reason ?? 'legacy-database-missing'); + this._setDatabaseExportImportStatus('', '#4169e1', true); + this._databaseExportImportErrorMessage(message); + return; + } + + this._settingsController.application.triggerStorageChanged(); + this._setDatabaseExportImportStatus( + `Done migrating ${result.totalRows} legacy IndexedDB rows into SQLite. Review your dictionaries and export a backup if everything looks correct.`, + '#006633', + ); + } catch (error) { + log.log(error); + this._setDatabaseExportImportStatus('', '#4169e1', true); + this._databaseExportImportErrorMessage('Encountered errors while running the experimental migration. 
Please restart the browser and verify your dictionaries before trying again.'); + } finally { + pageExitPrevention.end(); + this._settingsDatabaseOperationToken = null; + await this._updateLegacyIndexedDbMigrationUi(); + } + } + + /** */ + _onDatabaseUpdated() { + void this._updateLegacyIndexedDbMigrationUi(); + } } diff --git a/ext/settings.html b/ext/settings.html index d0b756381f..a7dfc941f3 100644 --- a/ext/settings.html +++ b/ext/settings.html @@ -2342,6 +2342,28 @@

Yomitan Settings

+
+
+
+ Experimental: migrate legacy IndexedDB dictionaries to SQLite +
+
+ This is an experimental recovery path for older dictionary data. It might break your dictionary collection. + Export your current dictionary collection first, and only use this when you need to recover dictionaries + from a legacy IndexedDB-backed Yomitan installation. +
+
+ Checking whether a legacy IndexedDB dictionary database is available... +
+
+
+
+
+ +
+
+
+
diff --git a/test/database.test.js b/test/database.test.js index 76f2565953..ac43e52d22 100644 --- a/test/database.test.js +++ b/test/database.test.js @@ -298,11 +298,15 @@ function concatUint8Arrays(chunks) { */ function cloneArrayBuffer(value) { if (value instanceof ArrayBuffer) { - return value.slice(0); + const copy = new Uint8Array(value.byteLength); + copy.set(new Uint8Array(value)); + return copy.buffer; } if (ArrayBuffer.isView(value)) { const typedArray = /** @type {ArrayBufferView} */ (value); - return typedArray.buffer.slice(typedArray.byteOffset, typedArray.byteOffset + typedArray.byteLength); + const copy = new Uint8Array(typedArray.byteLength); + copy.set(new Uint8Array(typedArray.buffer, typedArray.byteOffset, typedArray.byteLength)); + return copy.buffer; } return new ArrayBuffer(0); } @@ -1784,7 +1788,7 @@ describe('Database', () => { } }); - test('Migrates dictionaries from legacy IndexedDB into SQLite on first startup', async ({expect}) => { + test('Migrates dictionaries from legacy IndexedDB into SQLite only when requested', async ({expect}) => { const opfsRootDirectoryHandle = createInMemoryOpfsDirectoryHandle(); const restoreNavigator = installInMemoryOpfsNavigator(opfsRootDirectoryHandle); try { @@ -1809,6 +1813,22 @@ describe('Database', () => { const migratedDictionaryDatabase = new DictionaryDatabase(); await migratedDictionaryDatabase.prepare(); try { + const statusBeforeMigration = await migratedDictionaryDatabase.getLegacyIndexedDbMigrationStatus(); + expect.soft(statusBeforeMigration).toStrictEqual({ + reason: 'available', + hasLegacyDatabase: true, + hasLegacyData: true, + sqliteEmpty: true, + migrationAvailable: true, + }); + + const infoBeforeMigration = await migratedDictionaryDatabase.getDictionaryInfo(); + expect.soft(infoBeforeMigration).toStrictEqual([]); + + const migrationResult = await migratedDictionaryDatabase.migrateLegacyIndexedDb(); + expect.soft(migrationResult.result).toBe('migrated'); + 
expect.soft(migrationResult.reason).toBeNull(); + const migratedInfo = await migratedDictionaryDatabase.getDictionaryInfo(); const migratedCounts = await migratedDictionaryDatabase.getDictionaryCounts(['Legacy Dictionary'], true); expect.soft(migratedInfo).toStrictEqual(expectedInfo); @@ -1848,17 +1868,23 @@ describe('Database', () => { await populateLegacyIndexedDb(legacySnapshot); try { - const migrateLegacyIndexedDbIfNeeded = Reflect.get(currentDictionaryDatabase, '_migrateLegacyIndexedDbIfNeeded'); - if (typeof migrateLegacyIndexedDbIfNeeded !== 'function') { - throw new Error('Expected _migrateLegacyIndexedDbIfNeeded method'); - } - await Promise.resolve(migrateLegacyIndexedDbIfNeeded.call(currentDictionaryDatabase)); + const statusBeforeMigration = await currentDictionaryDatabase.getLegacyIndexedDbMigrationStatus(); + expect.soft(statusBeforeMigration).toStrictEqual({ + reason: 'sqlite-not-empty', + hasLegacyDatabase: true, + hasLegacyData: true, + sqliteEmpty: false, + migrationAvailable: false, + }); - const deleteLegacyIndexedDb = Reflect.get(currentDictionaryDatabase, '_deleteLegacyIndexedDb'); - if (typeof deleteLegacyIndexedDb !== 'function') { - throw new Error('Expected _deleteLegacyIndexedDb method'); - } - await Promise.resolve(deleteLegacyIndexedDb.call(currentDictionaryDatabase)); + const migrationResult = await currentDictionaryDatabase.migrateLegacyIndexedDb(); + expect.soft(migrationResult).toMatchObject({ + result: 'skipped', + reason: 'sqlite-not-empty', + migratedRowsByStore: {}, + totalRows: 0, + }); + expect.soft(typeof migrationResult.usedFallbackStorage).toBe('boolean'); const info = await currentDictionaryDatabase.getDictionaryInfo(); const counts = await currentDictionaryDatabase.getDictionaryCounts(['Current Dictionary'], true); @@ -1866,9 +1892,9 @@ describe('Database', () => { expect.soft(info.map((item) => item.title)).toStrictEqual(['Current Dictionary']); expect.soft(counts).toStrictEqual(expectedCounts); - const 
legacyDbAfterPrepare = await openLegacyIndexedDbIfPresent(); - expect.soft(legacyDbAfterPrepare).toBeNull(); - legacyDbAfterPrepare?.close(); + const legacyDbAfterAttempt = await openLegacyIndexedDbIfPresent(); + expect.soft(legacyDbAfterAttempt).not.toBeNull(); + legacyDbAfterAttempt?.close(); } finally { await currentDictionaryDatabase.close(); } diff --git a/types/ext/api.d.ts b/types/ext/api.d.ts index 3abe9a8d35..7080d82a08 100644 --- a/types/ext/api.d.ts +++ b/types/ext/api.d.ts @@ -121,6 +121,29 @@ export type GetTermFrequenciesDetailsTermReadingListItem = { reading: string | null; }; +export type LegacyIndexedDbMigrationReason = + 'available' | + 'indexeddb-unavailable' | + 'legacy-database-missing' | + 'legacy-database-empty' | + 'sqlite-not-empty'; + +export type LegacyIndexedDbMigrationStatus = { + reason: LegacyIndexedDbMigrationReason; + hasLegacyDatabase: boolean; + hasLegacyData: boolean; + sqliteEmpty: boolean; + migrationAvailable: boolean; +}; + +export type LegacyIndexedDbMigrationResult = { + result: 'migrated' | 'skipped'; + reason: LegacyIndexedDbMigrationReason | null; + migratedRowsByStore: Partial>; + totalRows: number; + usedFallbackStorage: boolean; +}; + type ApiSurface = { applicationReady: { params: void; @@ -291,6 +314,14 @@ type ApiSurface = { params: void; return: DictionaryImporter.Summary[]; }; + getLegacyIndexedDbMigrationStatus: { + params: void; + return: LegacyIndexedDbMigrationStatus; + }; + migrateLegacyIndexedDb: { + params: void; + return: LegacyIndexedDbMigrationResult; + }; exportDictionaryDatabase: { params: void; return: ArrayBuffer; diff --git a/types/ext/backend.d.ts b/types/ext/backend.d.ts index 049b5d8bc3..8931dc7a43 100644 --- a/types/ext/backend.d.ts +++ b/types/ext/backend.d.ts @@ -19,7 +19,7 @@ import type * as Api from './api'; export type DatabaseUpdateType = 'dictionary'; -export type DatabaseUpdateCause = 'purge' | 'delete' | 'import'; +export type DatabaseUpdateCause = 'purge' | 'delete' | 'import' | 
'migrate'; export type MecabParseResults = [ dictionary: string, diff --git a/types/ext/offscreen.d.ts b/types/ext/offscreen.d.ts index 0914298855..8f42094033 100644 --- a/types/ext/offscreen.d.ts +++ b/types/ext/offscreen.d.ts @@ -15,6 +15,7 @@ * along with this program. If not, see . */ +import type * as Api from './api'; import type * as Dictionary from './dictionary'; import type * as DictionaryDatabase from './dictionary-database'; import type * as DictionaryImporter from './dictionary-importer'; @@ -39,6 +40,14 @@ type ApiSurface = { params: void; return: DictionaryImporter.Summary[]; }; + getLegacyIndexedDbMigrationStatusOffscreen: { + params: void; + return: Api.LegacyIndexedDbMigrationStatus; + }; + migrateLegacyIndexedDbOffscreen: { + params: void; + return: Api.LegacyIndexedDbMigrationResult; + }; exportDictionaryDatabaseOffscreen: { params: void; return: string;