diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..01fc42f
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..f1b2de2
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/pdf2image.iml b/.idea/pdf2image.iml
new file mode 100644
index 0000000..8b8c395
--- /dev/null
+++ b/.idea/pdf2image.iml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/output_images/page_1.png b/output_images/page_1.png
new file mode 100644
index 0000000..1c9609a
Binary files /dev/null and b/output_images/page_1.png differ
diff --git a/output_images/page_3.png b/output_images/page_3.png
new file mode 100644
index 0000000..292fbca
Binary files /dev/null and b/output_images/page_3.png differ
diff --git a/output_images/page_5.png b/output_images/page_5.png
new file mode 100644
index 0000000..2f94ad2
Binary files /dev/null and b/output_images/page_5.png differ
diff --git a/pdf2image/pdf2image.py b/pdf2image/pdf2image.py
index 21d19c8..40ed7aa 100644
--- a/pdf2image/pdf2image.py
+++ b/pdf2image/pdf2image.py
@@ -35,86 +35,58 @@
def convert_from_path(
- pdf_path: Union[str, PurePath],
- dpi: int = 200,
- output_folder: Union[str, PurePath] = None,
- first_page: int = None,
- last_page: int = None,
- fmt: str = "ppm",
- jpegopt: Dict = None,
- thread_count: int = 1,
- userpw: str = None,
- ownerpw: str = None,
- use_cropbox: bool = False,
- strict: bool = False,
- transparent: bool = False,
- single_file: bool = False,
- output_file: Any = uuid_generator(),
- poppler_path: Union[str, PurePath] = None,
- grayscale: bool = False,
- size: Union[Tuple, int] = None,
- paths_only: bool = False,
- use_pdftocairo: bool = False,
- timeout: int = None,
- hide_annotations: bool = False,
+ pdf_path: Union[str, PurePath],
+ dpi: int = 200,
+ output_folder: Union[str, PurePath] = None,
+ first_page: int = None,
+ last_page: int = None,
+ fmt: str = "ppm",
+ jpegopt: Dict = None,
+ thread_count: int = 1,
+ userpw: str = None,
+ ownerpw: str = None,
+ use_cropbox: bool = False,
+ strict: bool = False,
+ transparent: bool = False,
+ single_file: bool = False,
+ output_file: Any = uuid_generator(),
+ poppler_path: Union[str, PurePath] = None,
+ grayscale: bool = False,
+ size: Union[Tuple, int] = None,
+ paths_only: bool = False,
+ use_pdftocairo: bool = False,
+ timeout: int = None,
+ hide_annotations: bool = False,
+ page_indexes: List[int] = None # 允许指定要转换的特定页面
) -> List[Image.Image]:
- """Function wrapping pdftoppm and pdftocairo
+ """Function wrapping pdftoppm and pdftocairo, allowing page selection by indexes."""
- :param pdf_path: Path to the PDF that you want to convert
- :type pdf_path: Union[str, PurePath]
- :param dpi: Image quality in DPI (default 200), defaults to 200
- :type dpi: int, optional
- :param output_folder: Write the resulting images to a folder (instead of directly in memory), defaults to None
- :type output_folder: Union[str, PurePath], optional
- :param first_page: First page to process, defaults to None
- :type first_page: int, optional
- :param last_page: Last page to process before stopping, defaults to None
- :type last_page: int, optional
- :param fmt: Output image format, defaults to "ppm"
- :type fmt: str, optional
- :param jpegopt: jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format), defaults to None
- :type jpegopt: Dict, optional
- :param thread_count: How many threads we are allowed to spawn for processing, defaults to 1
- :type thread_count: int, optional
- :param userpw: PDF's password, defaults to None
- :type userpw: str, optional
- :param ownerpw: PDF's owner password, defaults to None
- :type ownerpw: str, optional
- :param use_cropbox: Use cropbox instead of mediabox, defaults to False
- :type use_cropbox: bool, optional
- :param strict: When a Syntax Error is thrown, it will be raised as an Exception, defaults to False
- :type strict: bool, optional
- :param transparent: Output with a transparent background instead of a white one, defaults to False
- :type transparent: bool, optional
- :param single_file: Uses the -singlefile option from pdftoppm/pdftocairo, defaults to False
- :type single_file: bool, optional
- :param output_file: What is the output filename or generator, defaults to uuid_generator()
- :type output_file: Any, optional
- :param poppler_path: Path to look for poppler binaries, defaults to None
- :type poppler_path: Union[str, PurePath], optional
- :param grayscale: Output grayscale image(s), defaults to False
- :type grayscale: bool, optional
- :param size: Size of the resulting image(s), uses the Pillow (width, height) standard, defaults to None
- :type size: Union[Tuple, int], optional
- :param paths_only: Don't load image(s), return paths instead (requires output_folder), defaults to False
- :type paths_only: bool, optional
- :param use_pdftocairo: Use pdftocairo instead of pdftoppm, may help performance, defaults to False
- :type use_pdftocairo: bool, optional
- :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
- :type timeout: int, optional
- :param hide_annotations: Hide PDF annotations in the output, defaults to False
- :type hide_annotations: bool, optional
- :raises NotImplementedError: Raised when conflicting parameters are given (hide_annotations for pdftocairo)
- :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded
- :raises PDFSyntaxError: Raised if there is a syntax error in the PDF and strict=True
- :return: A list of Pillow images, one for each page between first_page and last_page
- :rtype: List[Image.Image]
- """
+ # 如果指定了 page_indexes 参数,我们需要重新调整 first_page 和 last_page
+ if page_indexes is not None:
+ page_indexes = sorted(set(page_indexes)) # 排序并去重
+ first_page = page_indexes[0]
+ last_page = page_indexes[-1]
+
+ # 验证页面范围
+ if first_page is None or first_page < 1:
+ first_page = 1
+ # 获取总页数
+ page_count = pdfinfo_from_path(
+ pdf_path, userpw, ownerpw, poppler_path=poppler_path
+ )["Pages"]
+
+ if last_page is None or last_page > page_count:
+ last_page = page_count
+
+ if first_page > last_page:
+ return []
+
+ # 确定使用的转换工具
if use_pdftocairo and fmt == "ppm":
fmt = "png"
- # We make sure that if passed arguments are Path objects, they're converted to strings
+ # 如果传递的是 PurePath 对象,转换为字符串
if isinstance(pdf_path, PurePath):
pdf_path = pdf_path.as_posix()
@@ -124,53 +96,22 @@ def convert_from_path(
if isinstance(poppler_path, PurePath):
poppler_path = poppler_path.as_posix()
- page_count = pdfinfo_from_path(
- pdf_path, userpw, ownerpw, poppler_path=poppler_path
- )["Pages"]
-
- # We start by getting the output format, the buffer processing function and if we need pdftocairo
parsed_fmt, final_extension, parse_buffer_func, use_pdfcairo_format = _parse_format(
fmt, grayscale
)
- # We use pdftocairo is the format requires it OR we need a transparent output
use_pdfcairo = (
- use_pdftocairo
- or use_pdfcairo_format
- or (transparent and parsed_fmt in TRANSPARENT_FILE_TYPES)
- )
-
- poppler_version_major, poppler_version_minor = _get_poppler_version(
- "pdftocairo" if use_pdfcairo else "pdftoppm", poppler_path=poppler_path
+ use_pdftocairo
+ or use_pdfcairo_format
+ or (transparent and parsed_fmt in TRANSPARENT_FILE_TYPES)
)
- if poppler_version_major == 0 and poppler_version_minor <= 57:
- jpegopt = None
-
- if poppler_version_major == 0 and poppler_version_minor <= 83:
- hide_annotations = False
-
- # If output_file isn't a generator, it will be turned into one
- if not isinstance(output_file, types.GeneratorType) and not isinstance(
- output_file, ThreadSafeGenerator
- ):
- if single_file:
- output_file = iter([output_file])
- thread_count = 1
- else:
- output_file = counter_generator(output_file)
-
- if thread_count < 1:
- thread_count = 1
-
- if first_page is None or first_page < 1:
- first_page = 1
-
- if last_page is None or last_page > page_count:
- last_page = page_count
+ # 如果线程数超过页面数,则限制线程数
+ if thread_count > len(page_indexes) if page_indexes else (last_page - first_page + 1):
+ thread_count = len(page_indexes) if page_indexes else (last_page - first_page + 1)
- if first_page > last_page:
- return []
+ processes = []
+ images = []
try:
auto_temp_dir = False
@@ -178,26 +119,28 @@ def convert_from_path(
output_folder = tempfile.mkdtemp()
auto_temp_dir = True
- # Recalculate page count based on first and last page
- page_count = last_page - first_page + 1
+ # 多线程处理
+ for i in range(thread_count):
+ thread_output_file = next(output_file)
- if thread_count > page_count:
- thread_count = page_count
+ # 获取该线程应处理的页面范围
+ if page_indexes is not None:
+ pages_to_process = page_indexes[i::thread_count] # 均匀分配页面到每个线程
+ else:
+ pages_to_process = list(range(first_page + i, last_page + 1, thread_count))
- reminder = page_count % thread_count
- current_page = first_page
- processes = []
- for _ in range(thread_count):
- thread_output_file = next(output_file)
+ if not pages_to_process:
+ continue
- # Get the number of pages the thread will be processing
- thread_page_count = page_count // thread_count + int(reminder > 0)
- # Build the command accordingly
+ first_page_in_thread = pages_to_process[0]
+ last_page_in_thread = pages_to_process[-1]
+
+ # 构建命令
args = _build_command(
["-r", str(dpi), pdf_path],
output_folder,
- current_page,
- current_page + thread_page_count - 1,
+ first_page_in_thread,
+ last_page_in_thread,
parsed_fmt,
jpegopt,
thread_output_file,
@@ -220,21 +163,18 @@ def convert_from_path(
else:
args = [_get_command_path("pdftoppm", poppler_path)] + args
- # Update page values
- current_page = current_page + thread_page_count
- reminder -= int(reminder > 0)
# Add poppler path to LD_LIBRARY_PATH
env = os.environ.copy()
if poppler_path is not None:
env["LD_LIBRARY_PATH"] = (
- poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
+ poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
)
- # Spawn the process and save its uuid
startupinfo = None
if platform.system() == "Windows":
- # this startupinfo structure prevents a console window from popping up on Windows
startupinfo = subprocess.STARTUPINFO()
startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+
+ # 启动进程
processes.append(
(
thread_output_file,
@@ -244,8 +184,7 @@ def convert_from_path(
)
)
- images = []
-
+ # 等待所有进程完成并收集结果
for uid, proc in processes:
try:
data, err = proc.communicate(timeout=timeout)
diff --git a/specific_page_conversion_example.py b/specific_page_conversion_example.py
new file mode 100644
index 0000000..3640ffb
--- /dev/null
+++ b/specific_page_conversion_example.py
@@ -0,0 +1,27 @@
+from pathlib import Path
+import os
+
+from pdf2image import convert_from_path
+
+# PDF 文件的路径
+pdf_path = "D:\\kaiyuan\\pdf2image\\testpdf.pdf"
+
+page_indexes = [1, 3, 5]
+
+output_folder = "D:\\kaiyuan\\pdf2image\\output_images"
+Path(output_folder).mkdir(parents=True, exist_ok=True)
+
+# 调用 convert_from_path 函数,将所有页面转换为图片
+images = convert_from_path(
+ pdf_path=pdf_path,
+ dpi=300, # 设置 DPI 分辨率
+ output_folder=None, # 不保存到文件夹,直接在内存中处理
+ fmt="png", # 输出格式为 PNG
+)
+
+for idx, img in enumerate(images):
+ if (idx + 1) in page_indexes:
+ img.save(os.path.join(output_folder, f"page_{idx + 1}.png"))
+ img.close()
+
+print(f"所有指定的图片已成功保存到 {output_folder}")
diff --git a/testpdf.pdf b/testpdf.pdf
new file mode 100644
index 0000000..dd9dd1a
Binary files /dev/null and b/testpdf.pdf differ