pdf文件处理
常用库
库名称 | 特点 | 备注 |
---|---|---|
pypdf | 简单纯净的pdf操作 | pdm add pypdf |
pymupdf(旧导入名:fitz) | 可将页面渲染为图片、调整页面尺寸 | pip install pymupdf |
comtypes(COM 接口调用) | 要求系统中安装了 Adobe Acrobat DC | pip install comtypes |
COM调用
获取 PDF 文件的页码
import comtypes.client
def list_pdf_page(folder: str):
    """
    Print the page count of every PDF file located directly inside ``folder``.

    The documents are opened through the Adobe Acrobat COM objects
    (``AcroExch.AVDoc`` / ``AcroExch.App``), so Acrobat must be installed.

    :param folder: directory whose ``.pdf`` files are inspected
    """
    import os  # local import: the snippet's import line only brings in comtypes

    # Create the Adobe Acrobat COM objects
    av_doc = comtypes.client.CreateObject("AcroExch.AVDoc")
    app = comtypes.client.CreateObject("AcroExch.App")
    try:
        # Walk the real directory listing instead of a hard-coded file-name
        # series, so the function works for any folder as its name promises.
        pdf_names = sorted(
            name for name in os.listdir(folder) if name.lower().endswith(".pdf")
        )
        for pdf_name in pdf_names:
            pdf_path = os.path.join(folder, pdf_name)
            print("pdf_path: ", pdf_path)
            if not av_doc.Open(pdf_path, ""):
                continue
            try:
                page_count = av_doc.GetPDDoc().GetNumPages()
                print(pdf_path, page_count)
            finally:
                # Always release the document, even if reading the count failed
                av_doc.Close(False)
    finally:
        # Shut Acrobat down no matter how the loop ended
        app.Exit()
导出为图片
import comtypes.client
import os
def export_pdf_to_images(pdf_path: str, output_folder: str) -> None:
    """
    Export every page of a PDF file as a JPG image via the Adobe Acrobat COM objects.

    Failures are reported on stdout instead of being raised, so the function
    always returns ``None``.

    :param pdf_path: path of the PDF file
    :param output_folder: folder that receives ``page_<n>.jpg`` files; created if missing
    """
    try:
        # Create the Adobe Acrobat COM objects
        av_doc = comtypes.client.CreateObject("AcroExch.AVDoc")
        app = comtypes.client.CreateObject("AcroExch.App")
        try:
            # Open the PDF file
            if not av_doc.Open(pdf_path, ""):
                print("Failed to open PDF file.")
                return
            try:
                pd_doc = av_doc.GetPDDoc()
                page_count = pd_doc.GetNumPages()

                # Make sure the output folder exists
                os.makedirs(output_folder, exist_ok=True)

                # Export each page as a JPG image
                for page_num in range(page_count):
                    page = pd_doc.AcquirePage(page_num)
                    rect = page.GetSize()
                    output_path = os.path.join(output_folder, f"page_{page_num + 1}.jpg")
                    # NOTE(review): this argument list (output path, format, DPI) is kept
                    # from the original; verify it against the Acrobat IAC reference for
                    # PDPage.DrawEx before relying on it.
                    page.DrawEx(rect, 0, 0, 0, output_path, "jpg", 300, 300)
            finally:
                # Close the document even when a page export raised
                av_doc.Close(False)
        finally:
            # Exit Acrobat on every path, including a failed Open
            app.Exit()
        print(f"Exported {page_count} pages to {output_folder}")
    except Exception as e:
        print(f"Error: {e}")
# Usage example
pdf_path, output_folder = "example.pdf", "output_images"
export_pdf_to_images(pdf_path, output_folder)
合并pdf
from pypdf import PdfWriter
import pathlib
def merge_pdfs(pdf_list, output_pdf: str = None):
    """
    Concatenate the given PDF files, in order, into a single PDF.

    :param pdf_list: sequence of PDF paths, in the order they should appear
    :param output_pdf: destination path; when omitted, the result is written next to
        the first input file as ``<first-file-stem>_merge.pdf``
    :raises ValueError: if ``output_pdf`` is omitted and ``pdf_list`` is empty
    """
    if output_pdf is None:
        if not pdf_list:
            raise ValueError(
                "pdf_list is empty, so no default output path can be derived"
            )
        first_pdf = pathlib.Path(pdf_list[0]).resolve()
        # Derive the name from the stem rather than str.replace(): replace() would
        # rewrite every occurrence of the suffix anywhere in the path and breaks
        # completely when the file has no suffix.
        output_pdf = str(first_pdf.with_name(f"{first_pdf.stem}_merge.pdf"))

    merger = PdfWriter()
    try:
        for pdf in pdf_list:
            merger.append(pdf)
        merger.write(output_pdf)
    finally:
        # Release the writer even if appending or writing failed
        merger.close()
if __name__ == "__main__":
    # Example: merge two cover pages into one output file
    output_file = r"Z:\work\2024\其他\加章\工作实绩-林芯伊、刘珊霞、周舜轩整理(1)\output.pdf"
    pdf_list = [
        r"Z:\work\2024\其他\加章\工作实绩-林芯伊、刘珊霞、周舜轩整理(1)\封面人员页-P章\2022年博罗水文站旱警水位-1.pdf",
        r"Z:\work\2024\其他\加章\工作实绩-林芯伊、刘珊霞、周舜轩整理(1)\封面人员页-P章\2022年博罗水文站旱警水位-2.pdf",
    ]
    merge_pdfs(pdf_list, output_file)
导出jpg
pdf2image + Poppler
使用pdf2image
需要确保系统中已安装 Poppler:
下载地址:https://github.com/oschwartz10612/poppler-windows/releases/
from pdf2image import convert_from_path
import os
def pdf_to_jpg_with_pdf2image(pdf_path: str, output_folder: str, dpi: int = 300):
    """
    Render each page of a PDF with pdf2image and store it as a JPG file.

    Args:
        pdf_path (str): Path of the PDF file.
        output_folder (str): Folder that receives the images; created when missing.
        dpi (int): Resolution handed to pdf2image for rendering.

    Returns:
        None
    """
    folder_missing = not os.path.exists(output_folder)
    if folder_missing:
        os.makedirs(output_folder)

    # Render all pages, then write them out as page_1.jpg, page_2.jpg, ...
    rendered_pages = convert_from_path(pdf_path, dpi=dpi)
    for page_no, page_image in enumerate(rendered_pages, start=1):
        target = os.path.join(output_folder, f"page_{page_no}.jpg")
        page_image.save(target, "JPEG")
        print(f"保存: {target}")
# Example call
pdf_to_jpg_with_pdf2image("example.pdf", "output_images_pdf2image", dpi=300)
指定导出A4、A3尺寸的jpg
pip install pymupdf pillow
python -m pip install pymupdf pillow
pdm add pymupdf pillow
def export_pdf_to_paper_size(pdf_path, output_path, dpi=300, paper_size="A4"):
    """
    Render every page of a PDF to a JPG whose canvas is exactly A4 or A3 at ``dpi``.

    Each page is scaled uniformly to fit inside the canvas, centred on a white
    background and saved as ``output_path + f"page-{page.number}.jpg"``.

    The names ``fitz`` (PyMuPDF), ``Image`` (Pillow) and ``io`` are used exactly as
    in the original snippet and must be imported by the surrounding module.

    :param pdf_path: path of the PDF file to render
    :param output_path: prefix prepended to every generated file name
    :param dpi: resolution used to convert the paper size to pixels and stored in the JPG
    :param paper_size: ``"A4"`` or ``"A3"``; any other value falls back to A4
    """
    # Paper sizes in millimetres
    paper_sizes = {"A4": (210, 297), "A3": (297, 420)}
    # The fallback must be a (width, height) tuple; the bare string "A4" would be
    # unpacked into two characters and crash in the arithmetic below.
    width_mm, height_mm = paper_sizes.get(paper_size, paper_sizes["A4"])

    # Millimetres -> inches -> pixels
    target_width = int(width_mm / 25.4 * dpi)
    target_height = int(height_mm / 25.4 * dpi)

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            # page.rect is measured in points; pick the largest uniform scale
            # that still fits inside the target canvas (keeps the aspect ratio).
            scale = min(
                target_width / page.rect.width,
                target_height / page.rect.height,
            )

            # Render through the matrix only: when dpi is also passed to
            # get_pixmap, PyMuPDF ignores the matrix and the page is not fitted.
            pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
            img = Image.open(io.BytesIO(pix.tobytes()))

            # Create the target-size canvas and paste the page in the centre
            canvas = Image.new("RGB", (target_width, target_height), (255, 255, 255))
            x = (target_width - img.width) // 2
            y = (target_height - img.height) // 2
            canvas.paste(img, (x, y))

            # Save the result together with the DPI information
            canvas.info["dpi"] = (dpi, dpi)
            canvas.save(
                output_path + f"page-{page.number}.jpg",
                "JPEG",
                quality=95,
                dpi=(dpi, dpi),
            )
    finally:
        # Close the document even if rendering or saving a page failed
        doc.close()
使用
if __name__ == "__main__":
    # NOTE(review): pdf_to_jpg is not defined in the snippets visible here —
    # confirm which function this example is meant to call.
    target_pdf = r"D:\Work\大藤峡\归档\电子版\卷001\BS·22·01-001-1.pdf"
    pdf_to_jpg(target_pdf)
修改尺寸比例(A4)
# -*- coding: utf-8 -*-
#
# @Author: CPS
# @email: 373704015@qq.com
# @Date:
# @Last Modified by: CPS
# @Last Modified time: 2025-04-10 23:25:27.768326
# @file_path "D:\CPS\MyProject\Projects_Personal\GG_wx_fapiao_download"
# @Filename "test.py"
# @Description: 修改pdf文件的尺寸为A4 置顶对齐,方便打印
#
import os, sys
sys.path.append("..")
from os import path
import fitz
def get_page_display_size(page):
    """
    Return ``(width, height)`` of the page as displayed.

    The two values of ``page.rect`` are swapped when ``page.rotation`` is 90 or 270.

    NOTE(review): this assumes ``page.rect`` is the unrotated rectangle; PyMuPDF may
    already report a rotated ``page.rect`` — confirm against the library docs.
    """
    sideways = page.rotation in (90, 270)
    box = page.rect
    return (box.height, box.width) if sideways else (box.width, box.height)
def is_a4_portrait(page, tolerance=5):
    """
    Tell whether the page's displayed size matches A4 portrait.

    Both the width (595 pt) and the height (842 pt) must be within ``tolerance``
    points of the displayed size reported by ``get_page_display_size``.
    """
    a4_size = (595, 842)  # A4 portrait in points
    displayed = get_page_display_size(page)
    return all(
        abs(actual - expected) <= tolerance
        for actual, expected in zip(displayed, a4_size)
    )
def convert_pdf_to_a4_portrait(input_path, output_path, vertical_align="top"):
    """
    Re-lay every page of a PDF onto an A4 portrait page (595 x 842 pt).

    Pages that already are A4 portrait are copied onto the full new page. Any
    other page is scaled uniformly to fit, centred horizontally, and placed
    according to ``vertical_align``.

    :param input_path: path of the source PDF
    :param output_path: path of the PDF to write
    :param vertical_align: ``"top"`` (case-insensitive) aligns the page to the top
        edge; any other value centres it vertically
    """
    a4_width, a4_height = 595, 842

    src_doc = fitz.open(input_path)
    try:
        new_doc = fitz.open()  # empty document that receives the A4 pages
        try:
            for page_num, page in enumerate(src_doc):
                # Every output page is A4, whichever branch fills it
                new_page = new_doc.new_page(width=a4_width, height=a4_height)

                if is_a4_portrait(page):
                    new_page.show_pdf_page(new_page.rect, src_doc, page_num)
                    continue

                display_width, display_height = get_page_display_size(page)
                scale = min(a4_width / display_width, a4_height / display_height)
                scaled_width = display_width * scale
                scaled_height = display_height * scale

                # Centre horizontally
                x = (a4_width - scaled_width) / 2
                # Top-align on request, otherwise centre vertically
                if vertical_align.lower() == "top":
                    y = 0
                else:
                    y = (a4_height - scaled_height) / 2

                dest_rect = fitz.Rect(x, y, x + scaled_width, y + scaled_height)
                new_page.show_pdf_page(dest_rect, src_doc, page_num)

            new_doc.save(output_path)
        finally:
            # Release the output document even if rendering or saving failed
            new_doc.close()
    finally:
        src_doc.close()
if __name__ == "__main__":
    # Usage example
    source_file = r"D:\CPS\MyProject\Projects_Personal\GG_wx_fapiao_download\data\粉盒,硒鼓.pdf"
    input_pdf = path.abspath(source_file)
    convert_pdf_to_a4_portrait(input_pdf, "output.pdf")