前几天有群友提出:能否通过自动化的方式提取PDF中书签对应的页面?例如,一个文件共100页,其中30页带有书签,且不规律地分布在整个文件中,如何自动提取这30页,并保留原有的书签结构?
小格利用元旦假期用Python写了一个小程序分享给大家。小程序界面非常简洁,既可以处理单个PDF,也可以批量处理指定文件夹及其子文件夹下的所有PDF;提取后的PDF文件与原文件位于同一目录下,文件名添加后缀“_提取书签对应页面”:
https://pan.baidu.com/s/1Ti0AgHTieWvRnTukUq_nJA?pwd=iReg
同时把Python源代码分享如下,欢迎大家提出宝贵意见,以便不断改进:
import tkinter
import tkinter.filedialog
import tkinter.messagebox
import os
import PyPDF2
def extract_bookmark_pages(input_path):
    """Copy every bookmarked page of a PDF into a new PDF, keeping the outline.

    The pages targeted by the bookmarks of ``input_path`` are copied, in
    outline order and at most once each, into a new file written next to the
    input with the suffix ``_提取书签对应页面.pdf``.  The bookmark hierarchy is
    rebuilt in the new file with every bookmark pointing at the new position
    of its page.

    Args:
        input_path: Path of the PDF file to read.

    Returns:
        None.  The result is written to disk.
    """
    with open(input_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        writer = PyPDF2.PdfWriter()

        def resolve_page(item):
            """Return the 0-based page index an outline item targets, or None."""
            page_number = reader.get_destination_page_number(item)
            # An unresolvable destination is reported as -1 by PyPDF2 (None in
            # newer pypdf releases).  Using -1 as an index would silently pull
            # in the LAST page of the document, so treat both as "no page".
            if page_number is None or page_number < 0:
                return None
            return page_number

        def iter_items(outlines):
            """Yield every outline item depth-first, flattening nested lists."""
            for entry in outlines:
                if isinstance(entry, list):
                    # A list holds the children of the preceding item.
                    for child in iter_items(entry):
                        yield child
                else:
                    yield entry

        # Map original page index -> index in the new file.  Building the map
        # while adding pages de-duplicates across ALL outline levels, so a
        # page bookmarked at several levels is copied only once.
        page_map = {}
        for item in iter_items(reader.outline):
            page_number = resolve_page(item)
            if page_number is not None and page_number not in page_map:
                page_map[page_number] = len(page_map)
                writer.add_page(reader.pages[page_number])

        def add_bookmarks_recursively(outlines, parent=None):
            """Recreate ``outlines`` in the writer underneath ``parent``."""
            # Start from ``parent`` so that a nested list appearing before any
            # item at this level still has a defined place to attach to.
            last_item = parent
            for entry in outlines:
                if isinstance(entry, list):
                    add_bookmarks_recursively(entry, last_item)
                    continue
                page_number = resolve_page(entry)
                if page_number is None:
                    # Bookmark without a usable target: skip it; any children
                    # it has are attached to the enclosing level instead.
                    last_item = parent
                    continue
                last_item = writer.add_outline_item(
                    entry.title, page_map[page_number], parent
                )

        add_bookmarks_recursively(reader.outline)

        output_path = os.path.splitext(input_path)[0] + '_提取书签对应页面.pdf'
        # Ask viewers to open the file with the bookmarks panel visible.
        writer.set_page_mode('/UseOutlines')
        # Write while the input file is still open: the copied pages are
        # assumed to reference data that is read lazily from the source file.
        with open(output_path, 'wb') as output_file:
            writer.write(output_file)
class Window():
    """Tkinter front end for extract_bookmark_pages.

    The window offers two rows: one to pick a single PDF and process it, and
    one to pick a directory and process every PDF found in it and in its
    sub-directories.
    """

    # Lower-case suffix carried by files written by extract_bookmark_pages.
    # Batch mode skips such files so that a second run over the same folder
    # does not produce "..._提取书签对应页面_提取书签对应页面.pdf" copies.
    OUTPUT_SUFFIX = '_提取书签对应页面.pdf'

    # Maximum number of failed paths listed in the batch summary dialog.
    MAX_REPORTED_FAILURES = 10

    def __init__(self):
        """Create the root window and lay out all widgets on a grid."""
        self.root = root = tkinter.Tk()

        # Row 0: single-file mode.
        tkinter.Label(root, text='选择文件').grid(row=0, column=0)
        self.entryfile_name = tkinter.Entry(root)
        self.entryfile_name.grid(row=0, column=1)
        tkinter.Button(root, text='浏览', command=self.select_file).grid(row=0, column=2)
        tkinter.Button(root, text='开始处理', command=self.Conv1).grid(row=0, column=3)

        # Row 1: separator text.
        tkinter.Label(root, text='或者').grid(row=1, column=0)

        # Row 2: batch (directory) mode.  The attributes below keep the names
        # the original code left bound after construction.
        self.label = tkinter.Label(root, text='选择目录')
        self.label.grid(row=2, column=0)
        self.entryDir = tkinter.Entry(root)
        self.entryDir.grid(row=2, column=1)
        self.BrowserDirButton = tkinter.Button(root, text='浏览', command=self.BrowserDir)
        self.BrowserDirButton.grid(row=2, column=2)
        self.ButtonCov = tkinter.Button(root, text='批量处理', command=self.Conv2)
        self.ButtonCov.grid(row=2, column=3)

    @classmethod
    def _find_pdf_files(cls, directory):
        """Return the paths of all PDFs under ``directory`` that need processing.

        Files are matched case-insensitively on the ``.pdf`` extension; files
        whose name ends with ``OUTPUT_SUFFIX`` are left out.  An empty or
        non-existent directory yields an empty list.
        """
        found = []
        for folder, _dirs, files in os.walk(directory):
            for name in files:
                lowered = name.lower()
                if lowered.endswith('.pdf') and not lowered.endswith(cls.OUTPUT_SUFFIX):
                    found.append(os.path.join(folder, name))
        return found

    def select_file(self):
        """Open a file chooser and put the chosen path into the file entry."""
        file_name = tkinter.filedialog.askopenfilename()
        if file_name:
            self.entryfile_name.delete(0, tkinter.END)
            self.entryfile_name.insert(tkinter.END, file_name)

    def Conv1(self):
        """Process the single file named in the file entry and report the outcome."""
        file_name = self.entryfile_name.get()
        if not file_name.lower().endswith('.pdf'):
            tkinter.messagebox.showinfo("iReg", "请选择PDF文件")
            return
        try:
            extract_bookmark_pages(file_name)
        except Exception as error:
            # Surface the failure in the GUI; otherwise the user would see
            # nothing at all when the file cannot be processed.
            tkinter.messagebox.showerror("iReg", "处理失败:{}".format(error))
            return
        tkinter.messagebox.showinfo("iReg", "已完成")

    def BrowserDir(self):
        """Open a directory chooser and put the chosen path into the directory entry."""
        directory = tkinter.filedialog.askdirectory(title='iReg小程序')
        if directory:
            self.entryDir.delete(0, tkinter.END)
            self.entryDir.insert(tkinter.END, directory)

    def Conv2(self):
        """Process every PDF under the chosen directory and report the outcome.

        Processing is best effort: a file that fails does not stop the batch,
        but it is listed in the summary dialog instead of being silently
        ignored.
        """
        directory = self.entryDir.get()
        if not os.path.isdir(directory):
            tkinter.messagebox.showinfo("iReg", "请选择目录")
            return

        failed = []
        for file_path in self._find_pdf_files(directory):
            try:
                extract_bookmark_pages(file_path)
            except Exception:
                failed.append(file_path)

        if not failed:
            tkinter.messagebox.showinfo("iReg", "已完成")
            return

        # Keep the dialog readable when many files fail.
        shown = failed[:self.MAX_REPORTED_FAILURES]
        details = "\n".join(shown)
        if len(failed) > len(shown):
            details += "\n…"
        tkinter.messagebox.showwarning(
            "iReg", "已完成,但以下文件处理失败:\n" + details
        )

    def mainloop(self):
        """Fix the window at 380x120, set its title and run the Tk event loop."""
        self.root.minsize(380, 120)
        self.root.maxsize(380, 120)
        self.root.title('iReg小程序 - 提取PDF书签对应页面')
        self.root.mainloop()
if __name__ == "__main__":
    # Script entry point: build the GUI and hand control to the Tk event loop.
    Window().mainloop()