From 3ca7c004504990e753d90115dba37f617230f417 Mon Sep 17 00:00:00 2001
From: Pierre Leroux
Date: Wed, 20 Nov 2024 16:45:34 +0100
Subject: [PATCH] feat: mupdfjs with wasm build to extract metadata and cover
img
---
package-lock.json | 6 +++++
package.json | 1 +
src/main/pdf/extract.ts | 50 +++++++++++++++++++++++++++++++++++-
src/main/pdf/extract.type.ts | 8 +++---
tsconfig.json | 2 +-
webpack.config.main.js | 2 +-
6 files changed, 62 insertions(+), 7 deletions(-)
diff --git a/package-lock.json b/package-lock.json
index 7b5b40f0b5..5d5d5bc242 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -37,6 +37,7 @@
"match-sorter": "^8.0.0",
"mathjax": "^3.2.2",
"mime-types": "^2.1.35",
+ "mupdf": "github:edrlab/mupdf.js",
"nanoid": "^5.0.8",
"node-fetch": "^3.3.2",
"proxy-agent": "^6.4.0",
@@ -19736,6 +19737,11 @@
"multicast-dns": "cli.js"
}
},
+ "node_modules/mupdf": {
+ "version": "1.0.0",
+ "resolved": "git+ssh://git@github.com/edrlab/mupdf.js.git#928cad2fcf7db5e3bd46070f140a2713f873bb86",
+ "license": "AGPL-3.0-or-later"
+ },
"node_modules/nanoid": {
"version": "5.0.8",
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-5.0.8.tgz",
diff --git a/package.json b/package.json
index 3cf7c51d3c..39b2171ca2 100644
--- a/package.json
+++ b/package.json
@@ -291,6 +291,7 @@
"match-sorter": "^8.0.0",
"mathjax": "^3.2.2",
"mime-types": "^2.1.35",
+ "mupdf": "github:edrlab/mupdf.js",
"nanoid": "^5.0.8",
"node-fetch": "^3.3.2",
"proxy-agent": "^6.4.0",
diff --git a/src/main/pdf/extract.ts b/src/main/pdf/extract.ts
index 2fc18805d2..24715e9c97 100644
--- a/src/main/pdf/extract.ts
+++ b/src/main/pdf/extract.ts
@@ -13,11 +13,59 @@ import { encodeURIComponent_RFC3986 } from "@r2-utils-js/_utils/http/UrlUtils";
import { IInfo } from "./extract.type";
+import { readFile } from "node:fs/promises";
+
+import * as mupdfjs from "mupdf";
+
const debug = debug_("readium-desktop:main/pdf/extract/index.ts");
debug("_");
type TExtractPdfData = [data: IInfo | undefined, coverPNG: Buffer | undefined];
-export const extractPDFData =
+
+/**
+ * Extracts the document metadata and a PNG rendering of the first page (cover) of a PDF
+ * with the MuPDF WASM build.
+ *
+ * @param pdfPath - path of the PDF file, read entirely into memory.
+ * @returns `[info, coverPNG]` on success, `[undefined, undefined]` if reading, parsing or
+ * rendering throws (the error is only logged through `debug`).
+ */
+export const extractPDFData = async (pdfPath: string): Promise<TExtractPdfData> => {
+    try {
+        const pdfBuffer = await readFile(pdfPath);
+        const doc = mupdfjs.PDFDocument.openDocument(pdfBuffer, "application/pdf");
+        // Metadata is looked up through the `info:*` keys passed to getMetaData().
+        const info: IInfo = {
+            Title: doc.getMetaData("info:Title"),
+            Subject: doc.getMetaData("info:Subject"),
+            Keywords: doc.getMetaData("info:Keywords"),
+            Author: doc.getMetaData("info:Author"),
+            Creator: doc.getMetaData("info:Creator"),
+            Producer: doc.getMetaData("info:Producer"),
+            CreationDate: doc.getMetaData("info:CreationDate"),
+            ModDate: doc.getMetaData("info:ModDate"),
+            numberOfPages: doc.countPages(),
+        };
+
+        // NOTE(review): doc/page/pixmap presumably wrap WASM memory; confirm whether destroy() is needed.
+        const page = new mupdfjs.PDFPage(doc, 0); // page index 0 is used as the cover
+        const pixmap = page.toPixmap(mupdfjs.Matrix.identity, mupdfjs.ColorSpace.DeviceRGB, false, true);
+        // Buffer.from() copies the PNG bytes in one call instead of a per-byte loop.
+        const img = Buffer.from(pixmap.asPNG());
+
+        return [info, img];
+    } catch (e) {
+        // Best effort: failures are logged and reported as "nothing extracted".
+        debug("####");
+        debug("####");
+        debug(e);
+        debug("####");
+        debug("####");
+    }
+    return [undefined, undefined];
+};
+
+export const extractPDFDataPdfjs =
async (pdfPath: string)
: Promise => {
diff --git a/src/main/pdf/extract.type.ts b/src/main/pdf/extract.type.ts
index f5cc73f683..cdf856a6ed 100644
--- a/src/main/pdf/extract.type.ts
+++ b/src/main/pdf/extract.type.ts
@@ -7,10 +7,10 @@
export interface IInfo {
PDFFormatVersion?: string;
- IsAcroFormPresent?: boolean;
- IsCollectionPresent?: boolean;
- IsLinearized?: boolean;
- IsXFAPresent?: boolean;
+ // IsAcroFormPresent?: boolean;
+ // IsCollectionPresent?: boolean;
+ // IsLinearized?: boolean;
+ // IsXFAPresent?: boolean;
Title?: string;
Subject?: string;
Keywords?: string;
diff --git a/tsconfig.json b/tsconfig.json
index 90206474a7..1f75d587cc 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -44,7 +44,7 @@
"removeComments": true,
"skipLibCheck": false,
"module": "ES2020",
- "moduleResolution": "Node",
+ "moduleResolution": "node10",
"lib": [
"es2020",
"dom",
diff --git a/webpack.config.main.js b/webpack.config.main.js
index 4402d982ca..ad221403d1 100644
--- a/webpack.config.main.js
+++ b/webpack.config.main.js
@@ -45,7 +45,7 @@ const _externalsCache = new Set();
if (nodeEnv !== "production") {
const nodeExternals = require("webpack-node-externals");
const neFunc = nodeExternals({
- allowlist: ["timeout-signal", "nanoid", "normalize-url", "node-fetch", "data-uri-to-buffer", /^fetch-blob/, /^formdata-polyfill/],
+ allowlist: ["timeout-signal", "nanoid", "normalize-url", "node-fetch", "mupdf", "data-uri-to-buffer", /^fetch-blob/, /^formdata-polyfill/],
importType: function (moduleName) {
if (!_externalsCache.has(moduleName)) {
console.log(`WEBPACK EXTERNAL (MAIN): [${moduleName}]`);