diff --git a/ext/js/dictionary/dictionary-importer.js b/ext/js/dictionary/dictionary-importer.js index 4a37dd33b6..273871d3a1 100644 --- a/ext/js/dictionary/dictionary-importer.js +++ b/ext/js/dictionary/dictionary-importer.js @@ -26,6 +26,8 @@ import { } from '../../lib/zip.js'; import {ExtensionError} from '../core/extension-error.js'; import {parseJson} from '../core/json.js'; +import {log} from '../core/log.js'; +import {safePerformance} from '../core/safe-performance.js'; import {toError} from '../core/to-error.js'; import {stringReverse} from '../core/utilities.js'; import {getFileExtensionFromImageMediaType, getImageMediaTypeFromFileName} from '../media/media-util.js'; @@ -67,6 +69,8 @@ export class DictionaryImporter { throw new Error('Database is not ready'); } + const importStartTime = safePerformance.now(); + /** @type {Error[]} */ const errors = []; const maxTransactionLength = 1000; @@ -78,23 +82,10 @@ export class DictionaryImporter { * @param {import('dictionary-database').ObjectStoreData[]} entries */ const bulkAdd = async (objectStoreName, entries) => { - const entryCount = entries.length; - - let progressIndexIncrease = bulkAddProgressAllowance / Math.ceil(entryCount / maxTransactionLength); - if (entryCount < maxTransactionLength) { progressIndexIncrease = bulkAddProgressAllowance; } - if (entryCount === 0) { this._progressData.index += progressIndexIncrease; } - - for (let i = 0; i < entryCount; i += maxTransactionLength) { - const count = Math.min(maxTransactionLength, entryCount - i); - - try { - await dictionaryDatabase.bulkAdd(objectStoreName, entries, i, count); - } catch (e) { - errors.push(toError(e)); - } - - this._progressData.index += progressIndexIncrease; - this._progress(); + try { + await dictionaryDatabase.bulkAdd(objectStoreName, entries, 0, entries.length); + } catch (e) { + errors.push(toError(e)); } }; @@ -140,13 +131,14 @@ export class DictionaryImporter { // Load data const prefixWildcardsSupported = !!details.prefixWildcardsSupported; - this._progressNextStep(termFiles.length + termMetaFiles.length + kanjiFiles.length + kanjiMetaFiles.length + tagFiles.length); + const validationFileCount = termFiles.length + termMetaFiles.length + kanjiFiles.length + kanjiMetaFiles.length + tagFiles.length; + this._progressNextStep(validationFileCount * bulkAddProgressAllowance); - for (const termFile of termFiles) { await this._validateFile(termFile, dataBankSchemas[0]); } - for (const termMetaFile of termMetaFiles) { await this._validateFile(termMetaFile, dataBankSchemas[1]); } - for (const kanjiFile of kanjiFiles) { await this._validateFile(kanjiFile, dataBankSchemas[2]); } - for (const kanjiMetaFile of kanjiMetaFiles) { await this._validateFile(kanjiMetaFile, dataBankSchemas[3]); } - for (const tagFile of tagFiles) { await this._validateFile(tagFile, dataBankSchemas[4]); } + for (const termFile of termFiles) { await this._validateFile(termFile, dataBankSchemas[0], maxTransactionLength, bulkAddProgressAllowance); } + for (const termMetaFile of termMetaFiles) { await this._validateFile(termMetaFile, dataBankSchemas[1], maxTransactionLength, bulkAddProgressAllowance); } + for (const kanjiFile of kanjiFiles) { await this._validateFile(kanjiFile, dataBankSchemas[2], maxTransactionLength, bulkAddProgressAllowance); } + for (const kanjiMetaFile of kanjiMetaFiles) { await this._validateFile(kanjiMetaFile, dataBankSchemas[3], maxTransactionLength, bulkAddProgressAllowance); } + for (const tagFile of tagFiles) { await this._validateFile(tagFile, dataBankSchemas[4], maxTransactionLength, bulkAddProgressAllowance); } // termFiles is doubled due to media importing this._progressNextStep((termFiles.length * 2 + termMetaFiles.length + kanjiFiles.length + kanjiMetaFiles.length + tagFiles.length) * bulkAddProgressAllowance); @@ -178,112 +170,92 @@ export class DictionaryImporter { try { const uniqueMediaPaths = new Set(); for (const termFile of termFiles) { - /** @type {import('dictionary-importer').ImportRequirement[]} */ - const requirements = []; - let termList = await ( - version === 1 ? - this._readFileSequence([termFile], this._convertTermBankEntryV1.bind(this), dictionaryTitle) : - this._readFileSequence([termFile], this._convertTermBankEntryV3.bind(this), dictionaryTitle) - ); - - // Prefix wildcard support - if (prefixWildcardsSupported) { - for (const entry of termList) { - entry.expressionReverse = stringReverse(entry.expression); - entry.readingReverse = stringReverse(entry.reading); - } - } - - // Extended data support - for (let i = 0, ii = termList.length; i < ii; ++i) { - const entry = termList[i]; - const glossaryList = entry.glossary; - for (let j = 0, jj = glossaryList.length; j < jj; ++j) { - const glossary = glossaryList[j]; - if (typeof glossary !== 'object' || glossary === null || Array.isArray(glossary)) { continue; } - glossaryList[j] = this._formatDictionaryTermGlossaryObject(glossary, entry, requirements); + /** @type {(batch: import('dictionary-database').DatabaseTermEntry[]) => Promise} */ + const onTermBatch = async (batch) => { + /** @type {import('dictionary-importer').ImportRequirement[]} */ + const requirements = []; + + for (const entry of batch) { + if (prefixWildcardsSupported) { + entry.expressionReverse = stringReverse(entry.expression); + entry.readingReverse = stringReverse(entry.reading); + } + const glossaryList = entry.glossary; + for (let j = 0, jj = glossaryList.length; j < jj; ++j) { + const glossary = glossaryList[j]; + if (typeof glossary !== 'object' || glossary === null || Array.isArray(glossary)) { continue; } + glossaryList[j] = this._formatDictionaryTermGlossaryObject(glossary, entry, requirements); + } } - } - - const alreadyAddedRequirements = requirements.filter((x) => { return uniqueMediaPaths.has(x.source.path); }); - const notAddedRequirements = requirements.filter((x) => { return !uniqueMediaPaths.has(x.source.path); }); - for (const requirement of requirements) { uniqueMediaPaths.add(requirement.source.path); } - - await this._resolveAsyncRequirements(alreadyAddedRequirements, fileMap); // already added must also be resolved for the term dict to have correct data - let {media} = await this._resolveAsyncRequirements(notAddedRequirements, fileMap); - await bulkAdd('media', media); - counts.media.total += media.length; - - this._progress(); - await bulkAdd('terms', termList); - counts.terms.total += termList.length; + const alreadyAddedRequirements = requirements.filter((x) => { return uniqueMediaPaths.has(x.source.path); }); + const notAddedRequirements = requirements.filter((x) => { return !uniqueMediaPaths.has(x.source.path); }); + for (const requirement of requirements) { uniqueMediaPaths.add(requirement.source.path); } - this._progress(); + await this._resolveAsyncRequirements(alreadyAddedRequirements, fileMap); + const {media} = await this._resolveAsyncRequirements(notAddedRequirements, fileMap); + await bulkAdd('media', media); + counts.media.total += media.length; - termList = []; - media = []; + await bulkAdd('terms', batch); + counts.terms.total += batch.length; + }; + await (version === 1 ? + this._readFileSequenceStreaming(termFile, this._convertTermBankEntryV1.bind(this), dictionaryTitle, onTermBatch, maxTransactionLength, 2 * bulkAddProgressAllowance) : + this._readFileSequenceStreaming(termFile, this._convertTermBankEntryV3.bind(this), dictionaryTitle, onTermBatch, maxTransactionLength, 2 * bulkAddProgressAllowance) + ); } for (const termMetaFile of termMetaFiles) { - let termMetaList = await this._readFileSequence([termMetaFile], this._convertTermMetaBankEntry.bind(this), dictionaryTitle); - - await bulkAdd('termMeta', termMetaList); - for (const [key, value] of Object.entries(this._getMetaCounts(termMetaList))) { - if (key in counts.termMeta) { - counts.termMeta[key] += value; - } else { - counts.termMeta[key] = value; + /** @type {(batch: import('dictionary-database').DatabaseTermMeta[]) => Promise} */ + const onTermMetaBatch = async (batch) => { + await bulkAdd('termMeta', batch); + for (const [key, value] of Object.entries(this._getMetaCounts(batch))) { + if (key in counts.termMeta) { + counts.termMeta[key] += value; + } else { + counts.termMeta[key] = value; + } } - } - - this._progress(); - - termMetaList = []; + }; + await this._readFileSequenceStreaming(termMetaFile, this._convertTermMetaBankEntry.bind(this), dictionaryTitle, onTermMetaBatch, maxTransactionLength, bulkAddProgressAllowance); } for (const kanjiFile of kanjiFiles) { - let kanjiList = await ( - version === 1 ? - this._readFileSequence([kanjiFile], this._convertKanjiBankEntryV1.bind(this), dictionaryTitle) : - this._readFileSequence([kanjiFile], this._convertKanjiBankEntryV3.bind(this), dictionaryTitle) + /** @type {(batch: import('dictionary-database').DatabaseKanjiEntry[]) => Promise} */ + const onKanjiBatch = async (batch) => { + await bulkAdd('kanji', batch); + counts.kanji.total += batch.length; + }; + await (version === 1 ? + this._readFileSequenceStreaming(kanjiFile, this._convertKanjiBankEntryV1.bind(this), dictionaryTitle, onKanjiBatch, maxTransactionLength, bulkAddProgressAllowance) : + this._readFileSequenceStreaming(kanjiFile, this._convertKanjiBankEntryV3.bind(this), dictionaryTitle, onKanjiBatch, maxTransactionLength, bulkAddProgressAllowance) ); - - await bulkAdd('kanji', kanjiList); - counts.kanji.total += kanjiList.length; - - this._progress(); - - kanjiList = []; } for (const kanjiMetaFile of kanjiMetaFiles) { - let kanjiMetaList = await this._readFileSequence([kanjiMetaFile], this._convertKanjiMetaBankEntry.bind(this), dictionaryTitle); - - await bulkAdd('kanjiMeta', kanjiMetaList); - for (const [key, value] of Object.entries(this._getMetaCounts(kanjiMetaList))) { - if (key in counts.kanjiMeta) { - counts.kanjiMeta[key] += value; - } else { - counts.kanjiMeta[key] = value; + /** @type {(batch: import('dictionary-database').DatabaseKanjiMeta[]) => Promise} */ + const onKanjiMetaBatch = async (batch) => { + await bulkAdd('kanjiMeta', batch); + for (const [key, value] of Object.entries(this._getMetaCounts(batch))) { + if (key in counts.kanjiMeta) { + counts.kanjiMeta[key] += value; + } else { + counts.kanjiMeta[key] = value; + } } - } - - this._progress(); - - kanjiMetaList = []; + }; + await this._readFileSequenceStreaming(kanjiMetaFile, this._convertKanjiMetaBankEntry.bind(this), dictionaryTitle, onKanjiMetaBatch, maxTransactionLength, bulkAddProgressAllowance); } for (const tagFile of tagFiles) { - let tagList = await this._readFileSequence([tagFile], this._convertTagBankEntry.bind(this), dictionaryTitle); - this._addOldIndexTags(index, tagList, dictionaryTitle); - - await bulkAdd('tagMeta', tagList); - counts.tagMeta.total += tagList.length; - - this._progress(); - - tagList = []; + /** @type {(batch: import('dictionary-database').Tag[]) => Promise} */ + const onTagBatch = async (batch) => { + this._addOldIndexTags(index, batch, dictionaryTitle); + await bulkAdd('tagMeta', batch); + counts.tagMeta.total += batch.length; + }; + await this._readFileSequenceStreaming(tagFile, this._convertTagBankEntry.bind(this), dictionaryTitle, onTagBatch, maxTransactionLength, bulkAddProgressAllowance); } importSuccess = true; @@ -315,6 +287,8 @@ export class DictionaryImporter { this._progress(); + log.log(`Dictionary import took ${((safePerformance.now() - importStartTime) / 1000).toFixed(2)}s`); + return {result: summary, errors}; } @@ -951,31 +925,233 @@ export class DictionaryImporter { } /** + * Streams a file from the archive using streaming decompression and a + * bracket-depth JSON scanner, calling onEntry for each parsed top-level + * array element. Never holds the full decompressed string or parsed array + * in memory. * @param {import('@zip.js/zip.js').Entry} file - * @param {import('dictionary-importer').CompiledSchemaName} schemaName - * @returns {Promise} + * @param {(entry: unknown) => void | Promise} onEntry + * @param {((fraction: number) => void) | null} [onProgress] + * @returns {Promise} */ - async _validateFile(file, schemaName) { - const content = await this._getData(file, new TextWriter()); - let entries; + async _forEachStreamedEntry(file, onEntry, onProgress = null) { + if (typeof file.getData === 'undefined') { + throw new Error(`Cannot read ${file.filename}`); + } - try { - /** @type {unknown} */ - entries = parseJson(content); - } catch (error) { - if (error instanceof Error) { - throw new Error(error.message + ` in '${file.filename}'`); + const {readable, writable} = new TransformStream(); + const dataPromise = file.getData(writable); + + const totalBytes = file.uncompressedSize; + let bytesRead = 0; + const countingStream = new TransformStream({ + transform(/** @type {Uint8Array} */ chunk, /** @type {TransformStreamDefaultController} */ controller) { + bytesRead += chunk.byteLength; + controller.enqueue(chunk); + }, + }); + + const textStream = readable.pipeThrough(countingStream).pipeThrough(new TextDecoderStream()); + const reader = textStream.getReader(); + + // Bracket-depth scanner state + let depth = 0; + let inString = false; + let escape = false; + let entryStart = -1; + let accumulated = ''; + let hasTopLevelArray = false; + let needsComma = false; + + for (;;) { + const {done, value} = await reader.read(); + if (done) { break; } + + const text = /** @type {string} */ (value); + for (let i = 0, ii = text.length; i < ii; i++) { + const ch = text.charCodeAt(i); + + if (escape) { + escape = false; + continue; + } + + if (inString) { + if (ch === 0x5C) { // backslash + escape = true; + } else if (ch === 0x22) { // double quote + inString = false; + } + continue; + } + + // At depth 0, only whitespace and the opening [ are valid + if (depth === 0 && ch !== 0x20 && ch !== 0x09 && ch !== 0x0A && ch !== 0x0D && (hasTopLevelArray || ch !== 0x5B)) { + throw new Error(`Dictionary has invalid data in '${file.filename}'`); + } + + switch (ch) { + case 0x22: // " + if (depth === 1) { + throw new Error(`Dictionary has invalid data in '${file.filename}'`); + } + inString = true; + break; + case 0x5B: // [ + depth++; + if (depth === 1) { + hasTopLevelArray = true; + } else if (depth === 2) { + if (needsComma) { + throw new Error(`Dictionary has invalid data in '${file.filename}'`); + } + entryStart = i; + accumulated = ''; + } + break; + case 0x7B: // { + if (depth <= 1) { + throw new Error(`Dictionary has invalid data in '${file.filename}'`); + } + depth++; + break; + case 0x5D: // ] + depth--; + if (depth === 1) { + accumulated += text.substring(entryStart, i + 1); + let parsed; + try { + parsed = parseJson(accumulated); + } catch (error) { + if (error instanceof Error) { + throw new Error(error.message + ` in '${file.filename}'`); + } + throw error; + } + await onEntry(parsed); + entryStart = -1; + needsComma = true; + } + break; + case 0x7D: // } + if (depth <= 1) { + throw new Error(`Dictionary has invalid data in '${file.filename}'`); + } + depth--; + break; + default: + // At depth 1, only whitespace (0x20 space, 0x09 tab, 0x0A LF, 0x0D CR) and commas are valid between entries + if (depth === 1) { + if (ch === 0x2C) { + if (!needsComma) { + throw new Error(`Dictionary has invalid data in '${file.filename}'`); + } + needsComma = false; + } else if (ch !== 0x20 && ch !== 0x09 && ch !== 0x0A && ch !== 0x0D) { + throw new Error(`Dictionary has invalid data in '${file.filename}'`); + } + } + break; + } + } + + // Accumulate remaining text if mid-entry at chunk boundary + if (entryStart >= 0) { + accumulated += text.substring(entryStart); + entryStart = 0; + } + + if (onProgress !== null && totalBytes > 0) { + onProgress(bytesRead / totalBytes); } } - const schema = ajvSchemas[schemaName]; - if (!schema(entries)) { - throw this._formatAjvSchemaError(schema, file.filename); + if (!hasTopLevelArray || depth !== 0) { + throw new Error(`Dictionary has invalid data in '${file.filename}'`); } - ++this._progressData.index; - this._progress(); + await dataPromise; + } + /** + * Reads a single file from the archive using streaming decompression, + * converting and flushing entries in batches. + * @template [TEntry=unknown] + * @template [TResult=unknown] + * @param {import('@zip.js/zip.js').Entry} file + * @param {(entry: TEntry, title: string) => TResult} convertEntry + * @param {string} dictionaryTitle + * @param {(batch: TResult[]) => Promise} onBatch + * @param {number} batchSize + * @param {number} progressBudget + * @returns {Promise} + */ + async _readFileSequenceStreaming(file, convertEntry, dictionaryTitle, onBatch, batchSize, progressBudget) { + /** @type {TResult[]} */ + let batch = []; + let progressAdded = 0; + await this._forEachStreamedEntry(file, async (entry) => { + batch.push(convertEntry(/** @type {TEntry} */ (entry), dictionaryTitle)); + if (batch.length >= batchSize) { + await onBatch(batch); + batch = []; + } + }, (fraction) => { + const target = Math.floor(fraction * progressBudget); + const increment = target - progressAdded; + if (increment > 0) { + this._progressData.index += increment; + progressAdded += increment; + this._progress(); + } + }); + if (batch.length > 0) { + await onBatch(batch); + } + const remaining = progressBudget - progressAdded; + if (remaining > 0) { + this._progressData.index += remaining; + this._progress(); + } + } + + /** + * @param {import('@zip.js/zip.js').Entry} file + * @param {import('dictionary-importer').CompiledSchemaName} schemaName + * @param {number} batchSize + * @param {number} progressBudget + * @returns {Promise} + */ + async _validateFile(file, schemaName, batchSize, progressBudget) { + const schema = ajvSchemas[schemaName]; + /** @type {unknown[]} */ + let batch = []; + let progressAdded = 0; + await this._forEachStreamedEntry(file, (entry) => { + batch.push(entry); + if (batch.length >= batchSize) { + if (!schema(batch)) { + throw this._formatAjvSchemaError(schema, file.filename); + } + batch = []; + } + }, (fraction) => { + const target = Math.floor(fraction * progressBudget); + const increment = target - progressAdded; + if (increment > 0) { + this._progressData.index += increment; + progressAdded += increment; + this._progress(); + } + }); + if (batch.length > 0 && !schema(batch)) { + throw this._formatAjvSchemaError(schema, file.filename); + } + const remaining = progressBudget - progressAdded; + if (remaining > 0) { + this._progressData.index += remaining; + this._progress(); + } return true; } diff --git a/package.json b/package.json index 0f1d1479fb..6f3470a920 100644 --- a/package.json +++ b/package.json @@ -118,9 +118,9 @@ "dexie-export-import": "^4.1.4", "hangul-js": "^0.2.6", "kanji-processor": "^1.0.2", + "linkedom": "^0.18.10", "parse5": "^7.2.1", - "yomitan-handlebars": "git+https://github.com/yomidevs/yomitan-handlebars.git#12aff5e3550954d7d3a98a5917ff7d579f3cce25", - "linkedom": "^0.18.10" + "yomitan-handlebars": "git+https://github.com/yomidevs/yomitan-handlebars.git#12aff5e3550954d7d3a98a5917ff7d579f3cce25" }, "lint-staged": { "*.md": "prettier --write" diff --git a/test/data/dictionaries/invalid-dictionary10/index.json b/test/data/dictionaries/invalid-dictionary10/index.json new file mode 100644 index 0000000000..0b207bfc52 --- /dev/null +++ b/test/data/dictionaries/invalid-dictionary10/index.json @@ -0,0 +1,7 @@ +{ + "title": "Invalid Dictionary 10", + "format": 3, + "revision": "test", + "sequenced": true, + "description": "String entries in term bank instead of arrays" +} diff --git a/test/data/dictionaries/invalid-dictionary10/term_bank_1.json b/test/data/dictionaries/invalid-dictionary10/term_bank_1.json new file mode 100644 index 0000000000..9c30a7479a --- /dev/null +++ b/test/data/dictionaries/invalid-dictionary10/term_bank_1.json @@ -0,0 +1 @@ +["hello", "world"] diff --git a/test/data/dictionaries/invalid-dictionary11/index.json b/test/data/dictionaries/invalid-dictionary11/index.json new file mode 100644 index 0000000000..6a01766b05 --- /dev/null +++ b/test/data/dictionaries/invalid-dictionary11/index.json @@ -0,0 +1,7 @@ +{ + "title": "Invalid Dictionary 11", + "format": 3, + "revision": "test", + "sequenced": true, + "description": "Null entries in term bank" +} diff --git a/test/data/dictionaries/invalid-dictionary11/term_bank_1.json b/test/data/dictionaries/invalid-dictionary11/term_bank_1.json new file mode 100644 index 0000000000..26d218ea60 --- /dev/null +++ b/test/data/dictionaries/invalid-dictionary11/term_bank_1.json @@ -0,0 +1 @@ +[null, null] diff --git a/test/data/dictionaries/invalid-dictionary12/index.json b/test/data/dictionaries/invalid-dictionary12/index.json new file mode 100644 index 0000000000..349b3d8666 --- /dev/null +++ b/test/data/dictionaries/invalid-dictionary12/index.json @@ -0,0 +1,7 @@ +{ + "title": "Invalid Dictionary 12", + "format": 3, + "revision": "test", + "sequenced": true, + "description": "Mixed valid array entry followed by invalid string entry in term bank" +} diff --git a/test/data/dictionaries/invalid-dictionary12/term_bank_1.json b/test/data/dictionaries/invalid-dictionary12/term_bank_1.json new file mode 100644 index 0000000000..a7074c695e --- /dev/null +++ b/test/data/dictionaries/invalid-dictionary12/term_bank_1.json @@ -0,0 +1 @@ +[["打", "だ", "", "", 1, "", 1, ""], "invalid"] diff --git a/test/data/dictionaries/invalid-dictionary7/index.json b/test/data/dictionaries/invalid-dictionary7/index.json new file mode 100644 index 0000000000..5c15e6b99e --- /dev/null +++ b/test/data/dictionaries/invalid-dictionary7/index.json @@ -0,0 +1,7 @@ +{ + "title": "Invalid Dictionary 7", + "format": 3, + "revision": "test", + "sequenced": true, + "description": "Non-array entries in term bank (numbers)" +} diff --git a/test/data/dictionaries/invalid-dictionary7/term_bank_1.json b/test/data/dictionaries/invalid-dictionary7/term_bank_1.json new file mode 100644 index 0000000000..b5d8bb58d9 --- /dev/null +++ b/test/data/dictionaries/invalid-dictionary7/term_bank_1.json @@ -0,0 +1 @@ +[1, 2, 3] diff --git a/test/data/dictionaries/invalid-dictionary8/index.json b/test/data/dictionaries/invalid-dictionary8/index.json new file mode 100644 index 0000000000..7252d2bf78 --- /dev/null +++ b/test/data/dictionaries/invalid-dictionary8/index.json @@ -0,0 +1,7 @@ +{ + "title": "Invalid Dictionary 8", + "format": 3, + "revision": "test", + "sequenced": true, + "description": "Boolean entries in term bank instead of arrays" +} diff --git a/test/data/dictionaries/invalid-dictionary8/term_bank_1.json b/test/data/dictionaries/invalid-dictionary8/term_bank_1.json new file mode 100644 index 0000000000..094967631e --- /dev/null +++ b/test/data/dictionaries/invalid-dictionary8/term_bank_1.json @@ -0,0 +1 @@ +[true, false] diff --git a/test/data/dictionaries/invalid-dictionary9/index.json b/test/data/dictionaries/invalid-dictionary9/index.json new file mode 100644 index 0000000000..8a617593d4 --- /dev/null +++ b/test/data/dictionaries/invalid-dictionary9/index.json @@ -0,0 +1,7 @@ +{ + "title": "Invalid Dictionary 9", + "format": 3, + "revision": "test", + "sequenced": true, + "description": "Object entries in term bank instead of arrays" +} diff --git a/test/data/dictionaries/invalid-dictionary9/term_bank_1.json b/test/data/dictionaries/invalid-dictionary9/term_bank_1.json new file mode 100644 index 0000000000..a936aea5f6 --- /dev/null +++ b/test/data/dictionaries/invalid-dictionary9/term_bank_1.json @@ -0,0 +1 @@ +[{"key": "val"}] diff --git a/test/data/json.json b/test/data/json.json index 01925f9e09..84f7d1aea4 100644 --- a/test/data/json.json +++ b/test/data/json.json @@ -22,6 +22,18 @@ {"path": "test/data/dictionaries/invalid-dictionary5/index.json", "ignore": true}, {"path": "test/data/dictionaries/invalid-dictionary6/term_meta_bank_1.json", "ignore": true}, {"path": "test/data/dictionaries/invalid-dictionary6/index.json", "ignore": true}, + {"path": "test/data/dictionaries/invalid-dictionary7/term_bank_1.json", "ignore": true}, + {"path": "test/data/dictionaries/invalid-dictionary7/index.json", "ignore": true}, + {"path": "test/data/dictionaries/invalid-dictionary8/term_bank_1.json", "ignore": true}, + {"path": "test/data/dictionaries/invalid-dictionary8/index.json", "ignore": true}, + {"path": "test/data/dictionaries/invalid-dictionary9/term_bank_1.json", "ignore": true}, + {"path": "test/data/dictionaries/invalid-dictionary9/index.json", "ignore": true}, + {"path": "test/data/dictionaries/invalid-dictionary10/term_bank_1.json", "ignore": true}, + {"path": "test/data/dictionaries/invalid-dictionary10/index.json", "ignore": true}, + {"path": "test/data/dictionaries/invalid-dictionary11/term_bank_1.json", "ignore": true}, + {"path": "test/data/dictionaries/invalid-dictionary11/index.json", "ignore": true}, + {"path": "test/data/dictionaries/invalid-dictionary12/term_bank_1.json", "ignore": true}, + {"path": "test/data/dictionaries/invalid-dictionary12/index.json", "ignore": true}, {"path": "test/jsconfig.json", "ignore": true}, {"path": "test/data/vitest.write.config.json", "ignore": true}, {"path": "test/data/vitest.options.config.json", "ignore": true}, diff --git a/test/database.test.js b/test/database.test.js index bcb5181cd1..c4c865ebe5 100644 --- a/test/database.test.js +++ b/test/database.test.js @@ -16,6 +16,7 @@ * along with this program. If not, see . */ +import {BlobWriter, TextReader, ZipWriter} from '@zip.js/zip.js'; import {IDBFactory, IDBKeyRange} from 'fake-indexeddb'; import {readFileSync} from 'node:fs'; import {fileURLToPath} from 'node:url'; @@ -43,6 +44,22 @@ async function createTestDictionaryArchiveData(dictionary, dictionaryName) { return await createDictionaryArchiveData(dictionaryDirectory, dictionaryName); } +/** + * Creates a dictionary zip archive with raw file contents, bypassing JSON parse/re-serialize. + * This allows testing with intentionally malformed JSON that parseJson would reject. + * @param {Record} files Map of filename to raw string content + * @returns {Promise} + */ +async function createRawDictionaryArchiveData(files) { + const zipFileWriter = new BlobWriter(); + const zipWriter = new ZipWriter(zipFileWriter, {level: 0}); + for (const [fileName, content] of Object.entries(files)) { + await zipWriter.add(fileName, new TextReader(content)); + } + const blob = await zipWriter.close(); + return await blob.arrayBuffer(); +} + /** * @param {import('vitest').ExpectStatic} expect * @param {import('dictionary-importer').OnProgressCallback} [onProgress] @@ -158,6 +175,12 @@ describe('Database', () => { {name: 'invalid-dictionary4'}, {name: 'invalid-dictionary5'}, {name: 'invalid-dictionary6'}, + {name: 'invalid-dictionary7'}, + {name: 'invalid-dictionary8'}, + {name: 'invalid-dictionary9'}, + {name: 'invalid-dictionary10'}, + {name: 'invalid-dictionary11'}, + {name: 'invalid-dictionary12'}, ]; describe.each(invalidDictionaries)('Invalid dictionary: $name', ({name}) => { test('Has invalid data', async ({expect}) => { @@ -173,6 +196,41 @@ describe('Database', () => { }); }); }); + describe('Invalid raw dictionaries', () => { + const indexJson = JSON.stringify({title: 'Raw Test', format: 3, revision: 'test', sequenced: true}); + const validEntry = '["打","だ","n","n",1,["definition"],1,""]'; + const rawInvalidDictionaries = [ + {name: 'missing comma between entries', termBank: `[${validEntry}${validEntry}]`}, + {name: 'leading comma', termBank: `[,${validEntry}]`}, + {name: 'double comma', termBank: `[${validEntry},,${validEntry}]`}, + {name: 'trailing garbage after array', termBank: `[${validEntry}]garbage`}, + {name: 'leading garbage before array', termBank: `garbage[${validEntry}]`}, + {name: 'concatenated arrays', termBank: `[${validEntry}][${validEntry}]`}, + {name: 'empty file', termBank: ''}, + {name: 'whitespace only', termBank: ' '}, + {name: 'just a number', termBank: '123'}, + {name: 'just a string', termBank: '"hello"'}, + {name: 'just null', termBank: 'null'}, + {name: 'unclosed array', termBank: `[${validEntry}`}, + {name: 'unclosed entry', termBank: '[["a","b"'}, + ]; + describe.each(rawInvalidDictionaries)('Raw invalid: $name', ({termBank}) => { + test('Has invalid data', async ({expect}) => { + const dictionaryDatabase = new DictionaryDatabase(); + await dictionaryDatabase.prepare(); + + const testDictionarySource = await createRawDictionaryArchiveData({ + 'index.json': indexJson, + 'term_bank_1.json': termBank, + }); + + /** @type {import('dictionary-importer').ImportDetails} */ + const importDetails = {prefixWildcardsSupported: false, yomitanVersion: '0.0.0.0'}; + await expect.soft(createDictionaryImporter(expect).importDictionary(dictionaryDatabase, testDictionarySource, importDetails)).rejects.toThrow('Dictionary has invalid data'); + await dictionaryDatabase.close(); + }); + }); + }); describe('Database valid usage', () => { const testDataFilePath = join(dirname, 'data/database-test-cases.json'); /** @type {import('test/database').DatabaseTestData} */