python利用pdfplumber进行pdf文档解析提取
目录
- pdfplumber 的特点
- 提取PDF中的图片
- 提取pdf 表格文本
- 提取PDF纯文本
- 读取富文本txt
pdfplumber 的特点
1、它是一个纯 python 第三方库,适合 python 3.x 版本
2、它用来查看pdf各类信息,能有效提取文本、表格
3、它不支持修改或生成pdf,也不支持对pdf扫描件的处理
import glob
import json
import os
import re
from collections import defaultdict

import pdfplumber


class PDFProcessor:
    """Extract text lines and tables from a PDF (via pdfplumber) into an
    ordered mapping of row records, tagging probable page headers/footers.

    Each record is ``{'page', 'allrow', 'type', 'inside'}`` where ``type``
    is ``'text'``, ``'excel'`` (one table row, stored as ``str(list)``),
    ``'页眉'`` (header) or ``'页脚'`` (footer).
    """

    def __init__(self, filepath):
        self.filepath = filepath
        # Open the document; pdfplumber holds the file handle for the
        # lifetime of this object.
        self.pdf = pdfplumber.open(filepath)
        self.all_text = defaultdict(dict)  # global row index -> record
        self.allrow = 0                    # next global row index
        self.last_num = 0                  # last row index of the previous page

    def check_lines(self, page, top, buttom):
        """Join the words of `page` into text, merging words that belong to
        the same visual line.

        `top` / `buttom` bound the vertical region of interest; '' means
        unbounded on that side.  Words on (almost) the same baseline, or
        continuation lines detected by the previous word reaching the right
        margin, are concatenated; otherwise a newline is inserted.
        """
        words = page.extract_words()
        # A line ending with one of these is treated as complete, so the
        # next word starts a new line even if the margin test fires.
        check_re = r'(?:。|;|单位:元|单位:万元|币种:人民币|\d|报告(?:全文)?(?:(修订版)|(修订稿)|(更正后))?)$'
        text = ''
        last_top = 0
        last_check = 0
        for word in words:
            # Decide whether this word falls in the requested region, and
            # which page-height factor the wrap heuristic uses.
            # NOTE(review): the unbounded branch uses 0.9 while the bounded
            # branches use 0.85, as in the original — confirm intentional.
            if top == '' and buttom == '':
                in_range, height_factor = True, 0.9
            elif top == '':
                in_range, height_factor = word['top'] > buttom, 0.85
            else:
                in_range, height_factor = buttom < word['top'] < top, 0.85
            if in_range:
                if abs(last_top - word['top']) <= 2:
                    # Same baseline as the previous word: same line.
                    text += word['text']
                elif (last_check > 0
                      and (page.height * height_factor - word['top']) > 0
                      and not re.search(check_re, text)):
                    # Previous word reached the right margin and the line
                    # does not look finished: treat as a wrapped line.
                    text += word['text']
                else:
                    text += '\n' + word['text']
            # Trackers are updated for every word, even filtered-out ones,
            # mirroring the original behaviour.
            last_top = word['top']
            last_check = word['x1'] - page.width * 0.85
        return text

    def drop_empty_cols(self, data):
        """Return `data` without columns whose every cell is ''."""
        transposed = list(map(list, zip(*data)))
        # '==' instead of 'is': identity comparison with a str literal is
        # implementation-dependent (and a SyntaxWarning on CPython 3.8+).
        kept = [col for col in transposed if not all(cell == '' for cell in col)]
        return list(map(list, zip(*kept)))

    @staticmethod
    def keep_visible_lines(obj):
        """pdfplumber filter: keep only visible rects and visible chars.

        A visible rect has a non-null ``non_stroking_color`` and is at
        least 1pt in both dimensions.
        """
        if obj['object_type'] == 'rect':
            if obj['non_stroking_color'] is None:
                return False
            if obj['width'] < 1 and obj['height'] < 1:
                return False
        if obj['object_type'] == 'char':
            return (obj['stroking_color'] is not None
                    and obj['non_stroking_color'] is not None)
        return True

    def _append_text_rows(self, page, text):
        """Store one 'text' record per line of `text`."""
        for line in text.split('\n'):
            self.all_text[self.allrow] = {'page': page.page_number,
                                          'allrow': self.allrow,
                                          'type': 'text',
                                          'inside': line}
            self.allrow += 1

    @staticmethod
    def _merge_wrapped_rows(grid):
        """Fold continuation rows (first cell None) into the row above, in place."""
        r_count = 0
        for r, row in enumerate(grid):
            if row[0] is None:
                r_count += 1
                for c, cell in enumerate(row):
                    if cell is not None and cell not in ['', ' ']:
                        if grid[r - r_count][c] is None:
                            grid[r - r_count][c] = cell
                        else:
                            grid[r - r_count][c] += cell
                        row[c] = None
            else:
                r_count = 0

    def _flatten_table(self, grid):
        """Drop folded-away/empty rows, strip newlines in cells, drop empty columns."""
        end_table = []
        for row in grid:
            if row[0] is not None:
                cells = ['' if cell is None else cell.replace('\n', '')
                         for cell in row]
                if any(cell != '' for cell in cells):
                    end_table.append(cells)
        return self.drop_empty_cols(end_table)

    def _mark_header_footer(self, page):
        """Heuristically re-tag this page's first/last row as header/footer."""
        first_re = r'[^计](?:报告(?:全文)?(?:(修订版)|(修订稿)|(更正后))?)$'
        # Raw string; the original non-raw '\\|\/' collapsed to the regex
        # literal '|/' and never matched a lone '\' or '/'.
        end_re = r'^(?:\d|\\|/|第|共|页|-|_| ){1,}'
        first_idx = 1 if self.last_num == 0 else self.last_num + 2
        last_idx = len(self.all_text) - 1
        try:
            first_text = str(self.all_text[first_idx]['inside'])
            end_text = str(self.all_text[last_idx]['inside'])
            if re.search(first_re, first_text) and '[' not in end_text:
                self.all_text[first_idx]['type'] = '页眉'
            if re.search(end_re, end_text) and '[' not in end_text:
                self.all_text[last_idx]['type'] = '页脚'
        except Exception:
            # Best effort: a malformed page only reports its number.
            print(page.page_number)

    def extract_text_and_tables(self, page):
        """Extract tables and surrounding text from `page` in reading order."""
        buttom = 0
        page = page.filter(self.keep_visible_lines)
        tables = page.find_tables()
        if tables:
            count = len(tables)
            for table in tables:
                # Skip tables lying above the already-consumed region.
                if table.bbox[3] >= buttom:
                    count -= 1
                    top = table.bbox[1]
                    # Text between the previous table (or page top) and this one.
                    self._append_text_rows(page, self.check_lines(page, top, buttom))
                    buttom = table.bbox[3]
                    grid = table.extract()
                    self._merge_wrapped_rows(grid)
                    for row in self._flatten_table(grid):
                        self.all_text[self.allrow] = {'page': page.page_number,
                                                      'allrow': self.allrow,
                                                      'type': 'excel',
                                                      'inside': str(row)}
                        self.allrow += 1
            if count == 0:
                # Text below the last table.
                self._append_text_rows(page, self.check_lines(page, '', buttom))
        else:
            # No tables: the whole page is plain text.
            # (Fixes the original's garbled 'pythontext_list' identifier.)
            self._append_text_rows(page, self.check_lines(page, '', ''))
        # Header / footer detection.
        self._mark_header_footer(page)
        self.last_num = len(self.all_text) - 1

    def process_pdf(self):
        """Process every page of the document."""
        for page in self.pdf.pages:
            self.extract_text_and_tables(page)

    def save_all_text(self, path):
        """Write all extracted records to `path`, one JSON object per line."""
        with open(path, 'w', encoding='utf-8') as file:
            for key in self.all_text.keys():
                file.write(json.dumps(self.all_text[key], ensure_ascii=False) + '\n')


def process_all_pdfs_in_folder(folder_path):
    """Process every file under `folder_path`, writing one .txt per PDF."""
    file_paths = sorted(glob.glob(f'{folder_path}/*'), reverse=True)
    for file_path in file_paths:
        print(file_path)
        try:
            processor = PDFProcessor(file_path)
            processor.process_pdf()
            # os.path.basename handles both path separators, unlike split('/').
            save_path = ('RAG_ASMPLE_DATAS_TXTS/'
                         + os.path.basename(file_path).replace('.pdf', '.txt'))
            processor.save_all_text(save_path)
        except Exception:
            print('check')


if __name__ == '__main__':
    # PDF file to parse.
    pdf_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__上海爱旭新能源股份有限公司__600732__爱旭股份__2019年__年度报告.pdf'
    # Destination of the parsed text.
    out_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__上海爱旭新能源股份有限公司__600732__爱旭股份__2019年__年度报告.txt'
    processor = PDFProcessor(pdf_path)
    processor.process_pdf()
    processor.save_all_text(out_path)
提取PDF中的图片
提取PDF中的图片并保存到本地
import pdfplumber import os # 定义函数用于提取PDF中的图片并保存 def extract_images_from_pdf(pdf_file, output_folder): # 创建输出文件夹,如果不存在的话 if not os.path.exists(output_folder): os.makedirs(output_folder) with pdfplumber.open(pdf_file) as pdf: # 遍历每一页 for page_number, page in enumerate(pdf.pages, start=1): print(f'页码:{page.page_number}') print(f'页面宽度:{page.width}') print(f'页面高度:{page.height}') # 获取该页的所有图片 images = page.images # 遍历该页的所有图片 for idx, image in enumerate(images, start=1): # 获取图片的二进制数据 image_data = image['stream'].get_data() # 构建图片文件名 image_filename = os.path.join(output_folder, f'image_{page_number}_{idx}.png') # 保存图片到文件 with open(image_filename, 'wb') as f: f.write(image_data) print(f'图片已保存至:{image_filename}') # 示例使用 pdf_file = 'example.pdf' output_folder = 'extracted_images' extract_images_from_pdf(pdf_file, output_folder)
提取pdf 表格文本
保存为excel文件
import pdfplum编程ber from openpyxl import Workbook # 定义函数用于提取PDF中的表格并保存为Excel文件 def extract_tables_to_excel(pdf_file, excel_output_file): with pdfplumber.open(pdf_file) as pdf: workbook = Workbook() sheet = workbook.active # 遍历每一页 for page in pdf.pages: # 提取该页的表格 table = page.extract_table() # 如果表格存在,则将其写入Excel文件 if table: for row in table: sheet.append(row) # 保存Excel文件 workbook.save(excel_output_file) # 示例使用 pdf_file = 'example.pdf' excel_output_file = 'tables.xlsx' extract_tables_to_excel(pdf_file, excel_output_file)
保存为文本文件
import pdfplumber # 定义函数用于提取PDF中的表格并保存为文本文件 def extract_tables_to_text(pdf_file, text_output_file): with pdfplumber.open(pdf_file) as pdf: with open(text_output_file, 'w', encoding='utf-8') as output: # 遍历每一页 for page in pdf.pages: # 提取该页的表格 table = page.extract_table() # 如果表格存在,则将其写入文本文件 if table: for row in table: output.write('\t'.join(str(cell) for cell in row) + '\n') # 示例使用 pdfwww.devze.com_file = 'example.pdf' text_output_file = 'tables.txt' extract_tables_to_text(pdf_file, text_output_file)
提取PDF纯文本
import pdfplumber # 定义函数用于提取PDF中的纯文本并保存为文本文件 def extract_text_to_file(pdf_file, text_output_file): with pdfplumber.open(pdf_file) as pdf: with open(text_output_file, 'w', encoding='utf-8') as output: # 遍历每一页 for page in pdf.pages: # 提取该页的文本 text = page.extract_text() # 如果文本存在,则将其写入文本文件 if text: output.write(text) # 示例使用 pdf_file = 'example.pdf' text_output_file = 'text.txt' extract_text_to_file(pdf_file, text_output_file)
读取富文本txt
python 读取文件函数有三种 read()、readline()、readlines()
- read() 一次性读取所有文本
- readline() 读取第一行的内容
- readlines() 读取全部内容,以数列的格式返回
# 一次性读取所有文本 with open('story.txt', 'r', encoding='utf-8') as f: data = f.read() print(data) # 读取第一行的内容 with open('story.txt', 'r', encoding='utf-8') as f: data = f.readline() print(data) # 读取全部内容,逐行读取并去除换行符 with open('story.txt', 'r', encoding='utf-8') as f: for line in f.readlines(): line = line.strip('\n') print(line)
以上就是python利用pdfplumber进行pdf文档解析提取的详细内容,更多关于python pdfplumber解析pdf的资料请关注编程客栈(www.devze.com)其它相关文章!
精彩评论