#!/usr/bin/env python3
"""Extract the content of every PPT, PDF and XLS document in a folder.

The script scans the reference-documents sub-directory of a base directory,
extracts text and tables from each supported file, and writes everything to
a single JSON file.
"""

import os
import json
import sys

# Optional third-party readers. Each import is guarded so the script still
# runs (and reports a per-file error) when a library is not installed.
try:
    from pptx import Presentation
    HAS_PPTX = True
except ImportError:
    HAS_PPTX = False
    print("Warning: python-pptx not available for .pptx files")

try:
    import pdfplumber
    HAS_PDFPLUMBER = True
except ImportError:
    HAS_PDFPLUMBER = False
    print("Warning: pdfplumber not available")

try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    HAS_PANDAS = False
    print("Warning: pandas not available")

try:
    import openpyxl
    HAS_OPENPYXL = True
except ImportError:
    HAS_OPENPYXL = False
    print("Warning: openpyxl not available")

try:
    import xlrd
    HAS_XLRD = True
except ImportError:
    HAS_XLRD = False
    print("Warning: xlrd not available")


def _json_default(value):
    """Convert a value the ``json`` module cannot encode natively.

    Spreadsheet cells may hold dates or NumPy scalars; without this hook a
    single such cell would abort the whole export with ``TypeError``.
    """
    if hasattr(value, 'isoformat'):
        # datetime/date/time-like objects -> ISO 8601 text.
        return value.isoformat()
    if hasattr(value, 'item'):
        # NumPy-style scalars expose .item() to get the plain Python value.
        return value.item()
    return str(value)


def extract_ppt_content(filepath):
    """Extract text and tables from a PowerPoint file.

    Args:
        filepath: Path to the presentation. Only ``.pptx`` (matched
            case-insensitively) can be read.

    Returns:
        A list with one dict per slide (``slide_number``, ``text``,
        ``tables``), or a dict ``{'error': message}`` when the file cannot
        be processed.
    """
    if not filepath.lower().endswith('.pptx'):
        # Legacy binary .ppt (or anything else) is not supported here.
        return {'error': f'无法处理旧格式PPT文件: {filepath}'}
    if not HAS_PPTX:
        # Report the real cause instead of blaming the file format.
        return {'error': 'python-pptx not available'}

    try:
        slides = []
        prs = Presentation(filepath)
        for slide_num, slide in enumerate(prs.slides, 1):
            slide_content = {
                'slide_number': slide_num,
                'text': [],
                'tables': [],
            }
            for shape in slide.shapes:
                if hasattr(shape, "text") and shape.text.strip():
                    slide_content['text'].append(shape.text.strip())
                if getattr(shape, "has_table", False):
                    table_data = [
                        [cell.text for cell in row.cells]
                        for row in shape.table.rows
                    ]
                    slide_content['tables'].append(table_data)
            slides.append(slide_content)
        return slides
    except Exception as e:
        # Best effort: one unreadable file must not stop the batch.
        return {'error': str(e)}


def extract_pdf_content(filepath):
    """Extract text and tables from a PDF file, page by page.

    Args:
        filepath: Path to the PDF file.

    Returns:
        A list with one dict per page (``page_number``, ``text``,
        ``tables``), or a dict ``{'error': message}`` on failure or when
        pdfplumber is not installed.
    """
    if not HAS_PDFPLUMBER:
        return {'error': 'pdfplumber not available'}

    try:
        pages = []
        with pdfplumber.open(filepath) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                page_text = page.extract_text()
                tables = page.extract_tables()
                pages.append({
                    'page_number': page_num,
                    'text': page_text if page_text else '',
                    'tables': tables if tables else [],
                })
        return pages
    except Exception as e:
        # Best effort: one unreadable file must not stop the batch.
        return {'error': str(e)}


def extract_xls_content(filepath):
    """Extract every sheet of an XLS/XLSX workbook.

    Args:
        filepath: Path to the workbook. ``.xlsx`` (matched
            case-insensitively) is read with the openpyxl engine, anything
            else with xlrd.

    Returns:
        A dict mapping sheet name to ``{'columns': [...], 'data': [[...]]}``
        with missing cells replaced by ``''``, or a dict
        ``{'error': message}`` on failure or when pandas is not installed.
    """
    if not HAS_PANDAS:
        return {'error': 'pandas not available'}

    engine = 'openpyxl' if filepath.lower().endswith('.xlsx') else 'xlrd'
    try:
        sheets = {}
        # The context manager closes the workbook handle even on error.
        with pd.ExcelFile(filepath, engine=engine) as xl_file:
            for sheet_name in xl_file.sheet_names:
                df = pd.read_excel(xl_file, sheet_name=sheet_name)
                sheets[sheet_name] = {
                    'columns': df.columns.tolist(),
                    'data': df.fillna('').values.tolist(),
                }
        return sheets
    except Exception as e:
        # Best effort: one unreadable file must not stop the batch.
        return {'error': str(e)}


def process_directory(base_dir, output_file):
    """Extract every supported file in the reference-documents folder.

    Args:
        base_dir: Directory that contains the ``参考文档`` sub-directory.
        output_file: Path of the JSON file to write.

    Returns:
        A dict with the keys ``ppt_files``, ``pdf_files`` and ``xls_files``,
        each mapping a file name to its extracted content.

    Raises:
        OSError: If the reference directory cannot be listed or the output
            file cannot be written.
    """
    results = {
        'ppt_files': {},
        'pdf_files': {},
        'xls_files': {},
    }
    # (extensions, result bucket, label used in progress output, extractor)
    handlers = (
        (('.ppt', '.pptx'), 'ppt_files', 'PPT', extract_ppt_content),
        (('.pdf',), 'pdf_files', 'PDF', extract_pdf_content),
        (('.xls', '.xlsx'), 'xls_files', 'XLS', extract_xls_content),
    )

    ref_dir = os.path.join(base_dir, '参考文档')

    for filename in sorted(os.listdir(ref_dir)):
        filepath = os.path.join(ref_dir, filename)
        if not os.path.isfile(filepath):
            continue

        print(f"处理文件: {filename}")

        # Compare lower-cased so "REPORT.PDF" is not silently skipped.
        lowered = filename.lower()
        for extensions, bucket, label, extractor in handlers:
            if not lowered.endswith(extensions):
                continue
            print(f" -> 提取{label}内容...")
            try:
                results[bucket][filename] = extractor(filepath)
            except Exception as e:
                print(f" -> 错误: {e}")
            break

    # Serialise before opening the file so a failure cannot leave a
    # truncated output file behind.
    payload = json.dumps(
        results, ensure_ascii=False, indent=2, default=_json_default
    )
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(payload)

    print(f"\n所有内容已保存到: {output_file}")
    return results


def main(argv=None):
    """Run the extraction and print a summary.

    Args:
        argv: Optional argument list (defaults to ``sys.argv[1:]``). The
            first item overrides the base directory, the second the output
            file path. With no arguments the original defaults are used.
    """
    if argv is None:
        argv = sys.argv[1:]

    base_dir = argv[0] if argv else r'd:\医院绩效系统'
    if len(argv) > 1:
        output_file = argv[1]
    else:
        output_file = os.path.join(base_dir, 'all_docs_content.json')

    print("=" * 60)
    print("开始提取所有文档内容")
    print("=" * 60)

    results = process_directory(base_dir, output_file)

    # Summary of how many files landed in each bucket.
    print("\n" + "=" * 60)
    print("提取完成统计:")
    print(f" PPT文件: {len(results['ppt_files'])} 个")
    print(f" PDF文件: {len(results['pdf_files'])} 个")
    print(f" XLS文件: {len(results['xls_files'])} 个")
    print("=" * 60)


if __name__ == '__main__':
    main()