PDF查询脚本
使用场景:PDF有上千页时,如果使用WPS搜索关键字,不仅每次搜索耗时很久,而且一次只能搜索单个关键字,需要花费大量时间,因此编写了这个脚本。
该脚本使用Python编写 实现如下功能:
1:可以对PDF进行多线程搜索
2:可以指定需要搜索的关键词和搜索的起始页和终止页
3:关键词支持正则表达式
4:将搜索结果输出到txt文本中,内容包含关键词所在的页码以及该页有多少个关键词
import re
import logging
import argparse
from concurrent.futures import ThreadPoolExecutor
from PyPDF2 import PdfReader
def search_page(page_num, text, patterns, file_logger):
    """Search one page's text for several regular-expression patterns.

    A console log record is emitted for every page (match or no match,
    followed by a "search complete" line). Pages with at least one match
    are additionally reported through ``file_logger``.

    Args:
        page_num: Page number; used in the log messages and echoed back in
            the return value.
        text: Text of the page. ``None`` is treated as an empty page.
        patterns: Iterable of regex patterns (strings or compiled patterns).
        file_logger: Logger that receives one record per matching page.

    Returns:
        A ``(page_num, matches)`` tuple. ``matches`` maps every pattern that
        matched to the list of full matched substrings, in order of
        appearance; it is an empty dict when nothing matched.
    """
    # Guard against a missing page text so the regex engine never sees None.
    text = text or ""

    matches = {}
    for pattern in patterns:
        # finditer + group(0) always yields the whole matched text.
        # re.findall would return only the captured groups for any pattern
        # that contains groups, hiding the actual keyword that was found.
        found = [m.group(0) for m in re.finditer(pattern, text)]
        if found:
            matches[pattern] = found

    # Lazy %-style arguments: same message text, formatted only if emitted.
    if matches:
        logging.info("Page %s: found %s", page_num, matches)
        file_logger.info("Page %s: found %s", page_num, matches)
    else:
        logging.info("Page %s: no matches", page_num)

    # Logged for every page, whether or not anything matched.
    logging.info("Page %s search complete.", page_num)
    return (page_num, matches)
def main():
    """Command-line entry point: search a page range of a PDF for regexes.

    Parses the command line, configures console logging plus a results file
    that receives only the matching pages, extracts the text of every page
    in the requested range, searches each page through a thread pool via
    ``search_page`` and finally prints a per-page summary of the matches.

    Every option has a default equal to the value that used to be
    hard-coded, so running the script without arguments behaves as before.
    """
    # Defaults mirror the values that were previously hard-coded.
    default_pdf = r"C:\Users\xiaoqi\Desktop\东莞\pingyin-2025318\王雄鑫_1741767720\王雄鑫_取证分析报告.pdf"  # replace with the actual PDF path
    default_patterns = [
        r"1759451979", r"1600447240", r"1606368439", r"744515677",
        r"1940126284", r"1826651681", r"1512955818", r"1398621025",
        r"296510430", r"602816590", r"1054632180", r"1148498244",
        r"1707546403", r"1258714239", r"1464245278", r"1805230931",
        r"1432671151", r"546487116", r"1922002443", r"1280362250",
        # Mobile-number pattern. The previous form was anchored with ^...$
        # and, without re.MULTILINE, could only match when the whole page
        # text was a single phone number. Digit-boundary lookarounds find
        # an 11-digit number anywhere in the page instead.
        r"(?<!\d)1[3-9]\d{9}(?!\d)",
    ]

    # Command-line options.
    parser = argparse.ArgumentParser(description="多线程PDF搜索工具")
    parser.add_argument('--max-threads', type=int, default=16,
                        help="指定线程池中线程数量,默认为16")
    parser.add_argument('--pdf', default=default_pdf,
                        help="要搜索的PDF文件路径")
    parser.add_argument('--start-page', type=int, default=285,
                        help="搜索起始页(从1开始),默认为285")
    parser.add_argument('--end-page', type=int, default=62874,
                        help="搜索终止页(包含该页),默认为62874")
    parser.add_argument('--pattern', action='append', dest='patterns',
                        help="要搜索的正则表达式,可重复指定;未指定时使用内置列表")
    parser.add_argument('--output', default="search_results.txt",
                        help="保存匹配结果的文本文件,默认为search_results.txt")
    args = parser.parse_args()

    # Fail fast on values that would otherwise blow up mid-run.
    if args.max_threads < 1:
        parser.error("--max-threads 必须大于等于 1")
    if args.start_page < 1:
        parser.error("--start-page 必须大于等于 1")
    patterns = args.patterns or default_patterns
    for pattern in patterns:
        try:
            re.compile(pattern)
        except re.error as exc:
            parser.error(f"无效的正则表达式 {pattern!r}: {exc}")

    # Console logging.
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    # File handler that records only the matching pages.
    file_handler = logging.FileHandler(args.output, mode="w", encoding="utf-8")
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))

    # Dedicated logger for the results file. Propagation is disabled because
    # search_page already logs the same message on the root logger; leaving
    # it on would print every match twice on the console.
    file_logger = logging.getLogger('file_logger')
    file_logger.setLevel(logging.INFO)
    file_logger.propagate = False
    file_logger.addHandler(file_handler)

    results = []
    try:
        reader = PdfReader(args.pdf)
        total_pages = len(reader.pages)

        # Clamp the requested range to the pages that actually exist.
        start_page = args.start_page
        end_page = args.end_page
        if end_page > total_pages:
            end_page = total_pages
            logging.info("实际PDF总页数为 %s,调整搜索结束页为 %s",
                         total_pages, total_pages)
        if start_page > end_page:
            logging.warning("起始页 %s 大于结束页 %s,没有可搜索的页面",
                            start_page, end_page)

        with ThreadPoolExecutor(max_workers=args.max_threads) as executor:
            futures = []
            # Pages are 1-based for the user, 0-based in reader.pages.
            # Text extraction happens here, in the submitting thread; only
            # the regex search itself is handed to the pool.
            # NOTE(review): whether PdfReader may be used from several
            # threads at once is not established here - confirm before
            # moving extraction into the workers.
            for i in range(start_page - 1, end_page):
                text = reader.pages[i].extract_text() or ""
                futures.append(
                    executor.submit(search_page, i + 1, text, patterns, file_logger)
                )

            # Collect in submission order, so the summary is sorted by page.
            # Log lines are still written as soon as each task finishes.
            for future in futures:
                page_num, match_dict = future.result()
                if match_dict:
                    results.append((page_num, match_dict))
    finally:
        # Flush and release the results file, and avoid stacking handlers
        # if main() is ever called more than once in the same process.
        file_logger.removeHandler(file_handler)
        file_handler.close()

    # Print the summary of matches to the console.
    for page_num, match_dict in results:
        print(f"第 {page_num} 页匹配到:")
        for pattern, found_list in match_dict.items():
            print(f" 模式 '{pattern}' 匹配结果: {found_list}")
# Run the search only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
PDF查询脚本
http://1.95.139.200:8090/archives/PDFSearch