提交文件

2026-02-28 15:16:15 +08:00
parent 1a4e50e0a4
commit 44f250f58e
159 changed files with 61268 additions and 0 deletions
--- a/extract_all_docs.py
+++ b/extract_all_docs.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+"""
+提取所有PPT、PDF、XLS文档内容
+"""
+import os
+import json
+import sys
+
+# 尝试导入各种库
+try:
+    from pptx import Presentation
+    HAS_PPTX = True
+except ImportError:
+    HAS_PPTX = False
+    print("Warning: python-pptx not available for .pptx files")
+
+try:
+    import pdfplumber
+    HAS_PDFPLUMBER = True
+except ImportError:
+    HAS_PDFPLUMBER = False
+    print("Warning: pdfplumber not available")
+
+try:
+    import pandas as pd
+    HAS_PANDAS = True
+except ImportError:
+    HAS_PANDAS = False
+    print("Warning: pandas not available")
+
+try:
+    import openpyxl
+    HAS_OPENPYXL = True
+except ImportError:
+    HAS_OPENPYXL = False
+    print("Warning: openpyxl not available")
+
+try:
+    import xlrd
+    HAS_XLRD = True
+except ImportError:
+    HAS_XLRD = False
+    print("Warning: xlrd not available")
+
+
+def extract_ppt_content(filepath):
+    """Extract the text and tables of every slide in a PowerPoint file.
+
+    Only ``.pptx`` files are read (the extension check is case-insensitive),
+    and only when python-pptx was imported successfully (``HAS_PPTX``).
+
+    Args:
+        filepath: Path of the presentation to read.
+
+    Returns:
+        On success, a list with one dict per slide containing:
+        ``slide_number`` (1-based), ``text`` (the stripped text of every
+        shape whose text is non-blank) and ``tables`` (for each table shape,
+        a list of rows, each row a list of cell texts).
+        On failure, a dict ``{'error': <message>}``. Exceptions are caught
+        and reported through that dict rather than raised.
+    """
+    try:
+        if not filepath.lower().endswith('.pptx'):
+            # Anything that is not .pptx is treated as the legacy format.
+            return {'error': f'无法处理旧格式PPT文件: {filepath}'}
+        if not HAS_PPTX:
+            # Report the missing library instead of blaming the file format.
+            return {'error': 'python-pptx not available'}
+
+        content = []
+        for slide_num, slide in enumerate(Presentation(filepath).slides, 1):
+            slide_content = {
+                'slide_number': slide_num,
+                'text': [],
+                'tables': [],
+            }
+            for shape in slide.shapes:
+                if hasattr(shape, "text") and shape.text.strip():
+                    slide_content['text'].append(shape.text.strip())
+                # Read defensively so that a shape type without this
+                # attribute does not fail the whole file.
+                if getattr(shape, 'has_table', False):
+                    slide_content['tables'].append([
+                        [cell.text for cell in row.cells]
+                        for row in shape.table.rows
+                    ])
+            content.append(slide_content)
+        return content
+    except Exception as e:
+        # Best effort: one unreadable file must not abort the whole run.
+        return {'error': str(e)}
+
+
+def extract_pdf_content(filepath):
+    """Extract the text and tables of every page in a PDF file.
+
+    Args:
+        filepath: Path of the PDF to read.
+
+    Returns:
+        On success, a list with one dict per page containing
+        ``page_number`` (1-based), ``text`` (empty string when the page
+        yields no text) and ``tables`` (empty list when none are found).
+        When pdfplumber is unavailable or reading fails, a dict
+        ``{'error': <message>}`` is returned instead.
+    """
+    if not HAS_PDFPLUMBER:
+        return {'error': 'pdfplumber not available'}
+
+    pages = []
+    try:
+        with pdfplumber.open(filepath) as document:
+            for number, page in enumerate(document.pages, start=1):
+                text = page.extract_text()
+                tables = page.extract_tables()
+                pages.append({
+                    'page_number': number,
+                    'text': text or '',
+                    'tables': tables or [],
+                })
+    except Exception as exc:
+        # Any failure discards the pages gathered so far and reports the error.
+        return {'error': str(exc)}
+    return pages
+
+
+def _to_json_safe(value):
+    """Return *value* in a form that ``json.dump`` can serialize."""
+    if value is None or isinstance(value, (str, bool, int, float)):
+        return value
+    if hasattr(value, 'isoformat'):
+        # Date/time objects (including pandas Timestamps) become ISO strings.
+        return value.isoformat()
+    if hasattr(value, 'item'):
+        # NumPy scalars unwrap to the corresponding built-in value.
+        unwrapped = value.item()
+        if unwrapped is None or isinstance(unwrapped, (str, bool, int, float)):
+            return unwrapped
+        return str(unwrapped)
+    return str(value)
+
+
+def extract_xls_content(filepath):
+    """Extract every sheet of an Excel workbook as JSON-serializable data.
+
+    Files ending in ``.xlsx`` (case-insensitive) are opened with the
+    openpyxl engine; everything else is opened with xlrd.
+
+    Args:
+        filepath: Path of the workbook to read.
+
+    Returns:
+        On success, a dict mapping each sheet name to
+        ``{'columns': [...], 'data': [[...], ...]}``. Missing cells are
+        replaced by ``''`` and values that JSON cannot represent (for
+        example timestamps) are converted to strings.
+        When pandas is unavailable or reading fails, a dict
+        ``{'error': <message>}`` is returned instead.
+    """
+    if not HAS_PANDAS:
+        return {'error': 'pandas not available'}
+
+    try:
+        engine = 'openpyxl' if filepath.lower().endswith('.xlsx') else 'xlrd'
+        content = {}
+        # The context manager closes the workbook handle even on errors.
+        with pd.ExcelFile(filepath, engine=engine) as xl_file:
+            for sheet_name in xl_file.sheet_names:
+                df = pd.read_excel(xl_file, sheet_name=sheet_name)
+                rows = df.fillna('').values.tolist()
+                content[sheet_name] = {
+                    'columns': [_to_json_safe(col) for col in df.columns.tolist()],
+                    'data': [[_to_json_safe(cell) for cell in row] for row in rows],
+                }
+        return content
+    except Exception as e:
+        # Best effort: one unreadable workbook must not abort the whole run.
+        return {'error': str(e)}
+
+
+def process_directory(base_dir, output_file):
+    """Extract every supported document under ``<base_dir>/参考文档``.
+
+    Regular files in that directory are visited in sorted order and
+    dispatched by extension (case-insensitive) to the matching extractor.
+    Files with other extensions are listed in the progress output but
+    otherwise ignored.
+
+    Args:
+        base_dir: Directory that contains the ``参考文档`` sub-directory.
+        output_file: Path of the JSON file the combined result is written to.
+
+    Returns:
+        A dict with the keys ``ppt_files``, ``pdf_files`` and ``xls_files``,
+        each mapping a file name to whatever its extractor returned.
+
+    Raises:
+        OSError: If the reference directory cannot be listed or the output
+            file cannot be written.
+    """
+    results = {
+        'ppt_files': {},
+        'pdf_files': {},
+        'xls_files': {}
+    }
+    # (result bucket, label for progress output, extractor, extensions)
+    handlers = (
+        ('ppt_files', 'PPT', extract_ppt_content, ('.ppt', '.pptx')),
+        ('pdf_files', 'PDF', extract_pdf_content, ('.pdf',)),
+        ('xls_files', 'XLS', extract_xls_content, ('.xls', '.xlsx')),
+    )
+
+    ref_dir = os.path.join(base_dir, '参考文档')
+
+    for filename in sorted(os.listdir(ref_dir)):
+        filepath = os.path.join(ref_dir, filename)
+
+        if not os.path.isfile(filepath):
+            continue
+
+        print(f"处理文件: {filename}")
+
+        # Compare in lower case so that e.g. "REPORT.PDF" is not skipped.
+        lowered = filename.lower()
+        for bucket, label, extractor, extensions in handlers:
+            if not lowered.endswith(extensions):
+                continue
+            print(f"  -> 提取{label}内容...")
+            try:
+                results[bucket][filename] = extractor(filepath)
+            except Exception as e:
+                # Keep going: one bad file must not stop the batch.
+                print(f"  -> 错误: {e}")
+            break
+
+    # Save the results. default=str keeps a single non-JSON-native value
+    # (e.g. a timestamp cell) from aborting the dump and leaving a
+    # truncated output file behind.
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(results, f, ensure_ascii=False, indent=2, default=str)
+
+    print(f"\n所有内容已保存到: {output_file}")
+    return results
+
+
+def main(base_dir=None):
+    """Run the extraction for one project directory and print a summary.
+
+    Args:
+        base_dir: Directory holding the ``参考文档`` folder; the JSON output
+            is written there too. When omitted, the first command-line
+            argument is used if one was given, otherwise the original
+            hard-coded project path.
+    """
+    if base_dir is None:
+        # Fall back to the historical default so existing usage is unchanged.
+        base_dir = sys.argv[1] if len(sys.argv) > 1 else r'd:\医院绩效系统'
+    output_file = os.path.join(base_dir, 'all_docs_content.json')
+
+    separator = "=" * 60
+    print(separator)
+    print("开始提取所有文档内容")
+    print(separator)
+
+    results = process_directory(base_dir, output_file)
+
+    # Print per-category file counts.
+    print("\n" + separator)
+    print("提取完成统计:")
+    print(f"  PPT文件: {len(results['ppt_files'])} 个")
+    print(f"  PDF文件: {len(results['pdf_files'])} 个")
+    print(f"  XLS文件: {len(results['xls_files'])} 个")
+    print(separator)
+
+
+if __name__ == '__main__':
+    main()