iReg小程序-提取PDF书签对应页面

文摘   2025-01-03 07:00   上海  

前几天有群友提出能否通过自动化实现提取PDF书签对应页面的功能,例如一个文件有100页,其中30页有书签且不规律地分布在整个文件中,如何自动化提取这30页,并且保留原有的书签结构:

小格元旦假期时间用Python写了一个小程序分享给大家,小程序界面非常简洁,可以处理单个PDF,也可以批量处理指定文件夹及其子文件夹下所有PDF,提取后的PDF文件和原文件在同一目录下,文件名添加后缀“_提取书签对应页面”:

需要的小伙伴们就扫码下载吧,压缩包里包含小程序、测试文件和使用说明:

https://pan.baidu.com/s/1Ti0AgHTieWvRnTukUq_nJA?pwd=iReg

同时把Python源代码分享如下,欢迎大家提出宝贵意见,以便不断改进:

import os
import tkinter
import tkinter.filedialog
import tkinter.messagebox

import PyPDF2
def extract_bookmark_pages(input_path):
    """Write a copy of a PDF containing only the pages its bookmarks point to.

    The bookmark (outline) tree of the source file is recreated in the new
    file, retargeted at the extracted pages. Pages are kept in the order in
    which the bookmarks first reference them, and each page is copied once
    even when several bookmarks (at any nesting level) point to it.

    The result is saved next to the source file, with the suffix
    ``_提取书签对应页面`` appended to the file name.

    Args:
        input_path: Path of the PDF file to read.
    """
    # The source file has to stay open until the writer has finished, so the
    # whole job runs inside this ``with`` block.
    with open(input_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        writer = PyPDF2.PdfWriter()

        def resolve_page(outline):
            """Return the bookmark's 0-based page number, or None if unusable."""
            page_num = reader.get_destination_page_number(outline)
            # NOTE(review): an unresolved destination is expected to come back
            # as a negative number (or None in newer releases) -- confirm for
            # the installed version. Using it as an index would silently pick
            # a wrong page, so such bookmarks are skipped instead.
            if page_num is None or page_num < 0:
                return None
            return page_num

        # Maps original page number -> page number in the new file. Dicts keep
        # insertion order, so this also records the extraction order, and the
        # membership test de-duplicates across ALL nesting levels.
        page_map = {}

        def collect_pages(outlines):
            """Walk the outline tree and register every referenced page once."""
            for outline in outlines:
                if isinstance(outline, list):
                    # A nested list holds the children of the previous bookmark.
                    collect_pages(outline)
                    continue
                page_num = resolve_page(outline)
                if page_num is not None and page_num not in page_map:
                    page_map[page_num] = len(page_map)

        collect_pages(reader.outline)
        for page_num in page_map:
            writer.add_page(reader.pages[page_num])

        def add_bookmarks_recursively(outlines, parent=None):
            """Recreate one outline level (and its children) in the writer."""
            # Start from ``parent`` so that a level beginning with a nested
            # list has a defined attachment point.
            last_item = parent
            for outline in outlines:
                if isinstance(outline, list):
                    add_bookmarks_recursively(outline, last_item)
                    continue
                page_num = resolve_page(outline)
                if page_num is None:
                    # Bookmark dropped: promote its children to this level.
                    last_item = parent
                    continue
                last_item = writer.add_outline_item(
                    outline.title, page_map[page_num], parent
                )

        add_bookmarks_recursively(reader.outline)

        output_path = os.path.splitext(input_path)[0] + '_提取书签对应页面.pdf'
        # Ask viewers to open the file with the bookmarks panel visible.
        writer.set_page_mode('/UseOutlines')
        with open(output_path, 'wb') as output_file:
            writer.write(output_file)
class Window:
    """Main window of the tool: process one PDF, or every PDF under a folder."""

    # Suffix that extract_bookmark_pages() gives its output files. Used to
    # avoid feeding earlier results back in during batch runs.
    _OUTPUT_SUFFIX = '_提取书签对应页面.pdf'

    # Upper bound on the number of failed files listed in the summary dialog.
    _MAX_REPORTED_FAILURES = 20

    def __init__(self):
        """Create the Tk root and lay out the widgets on a 3-row grid."""
        self.root = root = tkinter.Tk()

        # Row 0: single-file mode.
        self.label = tkinter.Label(root, text='选择文件')
        self.label.grid(row=0, column=0)
        self.entryfile_name = tkinter.Entry(root)
        self.entryfile_name.grid(row=0, column=1)
        self.BrowserDirButton = tkinter.Button(root, text='浏览', command=self.select_file)
        self.BrowserDirButton.grid(row=0, column=2)
        self.ButtonCov = tkinter.Button(root, text='开始处理', command=self.Conv1)
        self.ButtonCov.grid(row=0, column=3)

        # Row 1: separator text between the two modes.
        self.label = tkinter.Label(root, text='或者')
        self.label.grid(row=1, column=0)

        # Row 2: batch (directory) mode. The label/button attributes are
        # reassigned here, so after construction they refer to this row's
        # widgets.
        self.label = tkinter.Label(root, text='选择目录')
        self.label.grid(row=2, column=0)
        self.entryDir = tkinter.Entry(root)
        self.entryDir.grid(row=2, column=1)
        self.BrowserDirButton = tkinter.Button(root, text='浏览', command=self.BrowserDir)
        self.BrowserDirButton.grid(row=2, column=2)
        self.ButtonCov = tkinter.Button(root, text='批量处理', command=self.Conv2)
        self.ButtonCov.grid(row=2, column=3)

    def select_file(self):
        """Open a file chooser and put the chosen path into the file entry."""
        file_name = tkinter.filedialog.askopenfilename()
        if file_name:
            self.entryfile_name.delete(0, tkinter.END)
            self.entryfile_name.insert(tkinter.END, file_name)

    def Conv1(self):
        """Process the single PDF named in the file entry and report the result."""
        file_name = self.entryfile_name.get()
        if not file_name.lower().endswith('.pdf'):
            tkinter.messagebox.showinfo("iReg", "请选择PDF文件")
            return
        try:
            extract_bookmark_pages(file_name)
        except Exception as exc:
            # Tell the user what went wrong instead of failing silently.
            tkinter.messagebox.showerror("iReg", f"处理失败:{exc}")
            return
        tkinter.messagebox.showinfo("iReg", "已完成")

    def BrowserDir(self):
        """Open a directory chooser and put the chosen path into the dir entry."""
        directory = tkinter.filedialog.askdirectory(title='iReg小程序')
        if directory:
            self.entryDir.delete(0, tkinter.END)
            self.entryDir.insert(tkinter.END, directory)

    def Conv2(self):
        """Process every PDF under the chosen directory, including subfolders.

        A file that fails does not stop the batch; failed files are collected
        and listed in the final dialog.
        """
        directory = self.entryDir.get()
        if not os.path.isdir(directory):
            # Without this check os.walk() would yield nothing and the user
            # would be told the (non-existent) job had finished.
            tkinter.messagebox.showinfo("iReg", "请选择目录")
            return

        failed = []
        for dirpath, _dirnames, filenames in os.walk(directory):
            for filename in filenames:
                lowered = filename.lower()
                if not lowered.endswith('.pdf'):
                    continue
                if lowered.endswith(self._OUTPUT_SUFFIX):
                    # Output of an earlier run; reprocessing it would only
                    # produce files with a doubled suffix.
                    continue
                file_path = os.path.join(dirpath, filename)
                try:
                    extract_bookmark_pages(file_path)
                except Exception:
                    # Best effort: keep going, but remember the failure.
                    failed.append(file_path)

        if not failed:
            tkinter.messagebox.showinfo("iReg", "已完成")
            return

        shown = failed[:self._MAX_REPORTED_FAILURES]
        details = "\n".join(shown)
        if len(failed) > len(shown):
            details += f"\n……(共 {len(failed)} 个文件)"
        tkinter.messagebox.showwarning("iReg", "已完成,以下文件处理失败:\n" + details)

    def mainloop(self):
        """Fix the window size, set its title and enter the Tk event loop."""
        # Identical minimum and maximum sizes pin the window at 380x120.
        self.root.minsize(380, 120)
        self.root.maxsize(380, 120)
        self.root.title('iReg小程序 - 提取PDF书签对应页面')
        self.root.mainloop()
if __name__ == "__main__":
    # Script entry point: build the window, then block in the Tk event loop
    # until the user closes it.
    window = Window()
    window.mainloop()

iReg
iReg,爱注册——交流药品注册法规,分享药品注册经验
 最新文章