提交文件

This commit is contained in:
2026-02-28 15:16:15 +08:00
parent 1a4e50e0a4
commit 44f250f58e
159 changed files with 61268 additions and 0 deletions

188
extract_all_docs.py Normal file
View File

@@ -0,0 +1,188 @@
#!/usr/bin/env python3
"""
Extract the content of all PPT, PDF and XLS documents.
"""
import os
import json
import sys

# Optional third-party readers. Each one is probed independently and the
# outcome is recorded in a HAS_* flag; a missing reader only produces a
# warning on stdout so the remaining formats can still be processed.
try:
    from pptx import Presentation
except ImportError:
    HAS_PPTX = False
    print("Warning: python-pptx not available for .pptx files")
else:
    HAS_PPTX = True

try:
    import pdfplumber
except ImportError:
    HAS_PDFPLUMBER = False
    print("Warning: pdfplumber not available")
else:
    HAS_PDFPLUMBER = True

try:
    import pandas as pd
except ImportError:
    HAS_PANDAS = False
    print("Warning: pandas not available")
else:
    HAS_PANDAS = True

try:
    import openpyxl
except ImportError:
    HAS_OPENPYXL = False
    print("Warning: openpyxl not available")
else:
    HAS_OPENPYXL = True

try:
    import xlrd
except ImportError:
    HAS_XLRD = False
    print("Warning: xlrd not available")
else:
    HAS_XLRD = True


def extract_ppt_content(filepath):
    """Extract text and tables from a PowerPoint file.

    Only files whose name ends in ``.pptx`` (compared case-insensitively)
    are parsed; anything else is reported as an unsupported legacy file.

    Args:
        filepath: Path of the presentation, as a string.

    Returns:
        On success, a list with one dict per slide of the form
        ``{'slide_number': int, 'text': [str, ...], 'tables': [rows, ...]}``
        where each table is a list of rows and each row a list of cell texts.
        On any failure, a dict ``{'error': message}``; this function never
        raises for ordinary errors.
    """
    content = []
    try:
        if not filepath.lower().endswith('.pptx'):
            # Legacy binary .ppt (or any other extension) cannot be opened
            # with python-pptx.
            content = {'error': f'无法处理旧格式PPT文件: {filepath}'}
        elif not HAS_PPTX:
            # Report the real cause instead of blaming the file format,
            # matching what the PDF and XLS extractors do.
            content = {'error': 'python-pptx not available'}
        else:
            prs = Presentation(filepath)
            for slide_num, slide in enumerate(prs.slides, 1):
                slide_content = {
                    'slide_number': slide_num,
                    'text': [],
                    'tables': []
                }
                for shape in slide.shapes:
                    # Not every shape exposes .text; read it once when it does.
                    text = shape.text.strip() if hasattr(shape, "text") else ''
                    if text:
                        slide_content['text'].append(text)
                    if getattr(shape, 'has_table', False):
                        table_data = [
                            [cell.text for cell in row.cells]
                            for row in shape.table.rows
                        ]
                        slide_content['tables'].append(table_data)
                content.append(slide_content)
    except Exception as e:
        # Best-effort extraction: one unreadable file must not abort the batch.
        content = {'error': str(e)}
    return content


def extract_pdf_content(filepath):
    """Extract the text and tables of every page of a PDF file.

    Returns a list with one ``{'page_number', 'text', 'tables'}`` dict per
    page (pages numbered from 1; missing text becomes ``''`` and missing
    tables ``[]``). If pdfplumber is unavailable or anything goes wrong,
    a dict ``{'error': message}`` is returned instead.
    """
    try:
        if not HAS_PDFPLUMBER:
            return {'error': 'pdfplumber not available'}
        with pdfplumber.open(filepath) as document:
            return [
                {
                    'page_number': index,
                    'text': page.extract_text() or '',
                    'tables': page.extract_tables() or [],
                }
                for index, page in enumerate(document.pages, 1)
            ]
    except Exception as exc:
        return {'error': str(exc)}


def _json_safe_cell(value):
    """Return *value* in a form the ``json`` module can serialize."""
    if isinstance(value, (str, bool, int, float)):
        return value
    if hasattr(value, 'isoformat'):
        # datetime/date/time objects and pandas Timestamp values.
        return value.isoformat()
    if hasattr(value, 'item'):
        # numpy scalars unwrap to the matching Python scalar.
        return value.item()
    return str(value)


def extract_xls_content(filepath):
    """Extract every sheet of an XLS/XLSX workbook.

    Files ending in ``.xlsx`` (compared case-insensitively) are read with
    the ``openpyxl`` engine, everything else with ``xlrd``.

    Args:
        filepath: Path of the workbook, as a string.

    Returns:
        On success, a dict mapping each sheet name to
        ``{'columns': [...], 'data': [[...], ...]}``. Missing cells are
        replaced by ``''`` and date/time cells by their ISO-8601 text so
        the result can be passed to ``json.dump``.
        On any failure, a dict ``{'error': message}``.
    """
    content = {}
    try:
        if HAS_PANDAS:
            engine = 'openpyxl' if filepath.lower().endswith('.xlsx') else 'xlrd'
            # The context manager closes the workbook handle even when a
            # sheet fails to parse.
            with pd.ExcelFile(filepath, engine=engine) as xl_file:
                for sheet_name in xl_file.sheet_names:
                    df = pd.read_excel(xl_file, sheet_name=sheet_name)
                    # Walk the cells one by one rather than going through
                    # df.values, which can leave Timestamp objects behind.
                    rows = [
                        ['' if pd.isna(cell) else _json_safe_cell(cell)
                         for cell in row]
                        for row in df.itertuples(index=False, name=None)
                    ]
                    content[sheet_name] = {
                        'columns': [_json_safe_cell(col)
                                    for col in df.columns.tolist()],
                        'data': rows
                    }
        else:
            content = {'error': 'pandas not available'}
    except Exception as e:
        # Best-effort extraction: report the failure instead of raising.
        content = {'error': str(e)}
    return content


def process_directory(base_dir, output_file):
    """Extract every supported document in the reference-document folder.

    Scans the ``参考文档`` sub-directory of *base_dir* (non-recursively, in
    sorted order), dispatches each regular file to the matching extractor
    by its extension (compared case-insensitively) and writes the combined
    result to *output_file* as UTF-8 JSON.

    Args:
        base_dir: Directory that contains the ``参考文档`` folder.
        output_file: Path of the JSON file to write.

    Returns:
        A dict with the keys ``'ppt_files'``, ``'pdf_files'`` and
        ``'xls_files'``, each mapping a file name to its extracted content.

    Raises:
        OSError: If the reference folder cannot be listed or the output
            file cannot be written.
    """
    results = {
        'ppt_files': {},
        'pdf_files': {},
        'xls_files': {}
    }
    ref_dir = os.path.join(base_dir, '参考文档')
    for filename in sorted(os.listdir(ref_dir)):
        filepath = os.path.join(ref_dir, filename)
        if not os.path.isfile(filepath):
            continue
        print(f"处理文件: {filename}")
        # Match on the lower-cased name so that e.g. "REPORT.PDF" is not
        # silently skipped.
        lowered = filename.lower()
        try:
            if lowered.endswith(('.ppt', '.pptx')):
                print(" -> 提取PPT内容...")
                results['ppt_files'][filename] = extract_ppt_content(filepath)
            elif lowered.endswith('.pdf'):
                print(" -> 提取PDF内容...")
                results['pdf_files'][filename] = extract_pdf_content(filepath)
            elif lowered.endswith(('.xls', '.xlsx')):
                print(" -> 提取XLS内容...")
                results['xls_files'][filename] = extract_xls_content(filepath)
        except Exception as e:
            # Keep going: one bad file must not stop the whole batch.
            print(f" -> 错误: {e}")
    # Serialize before opening the file so that a serialization failure
    # cannot leave a truncated output file behind. default=str is
    # deliberate: values json cannot encode natively (e.g. spreadsheet
    # date cells) are stored as text rather than aborting the whole run.
    payload = json.dumps(results, ensure_ascii=False, indent=2, default=str)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(payload)
    print(f"\n所有内容已保存到: {output_file}")
    return results


def main(base_dir=None):
    """Run the extraction and print a per-format summary.

    Args:
        base_dir: Directory that holds the reference documents and receives
            ``all_docs_content.json``. When omitted, the first command-line
            argument is used if present, otherwise the original default
            ``d:\\医院绩效系统``.
    """
    if base_dir is None:
        # Allow running against another folder without editing the script.
        base_dir = sys.argv[1] if len(sys.argv) > 1 else r'd:\医院绩效系统'
    output_file = os.path.join(base_dir, 'all_docs_content.json')
    print("=" * 60)
    print("开始提取所有文档内容")
    print("=" * 60)
    results = process_directory(base_dir, output_file)
    # Print summary statistics.
    print("\n" + "=" * 60)
    print("提取完成统计:")
    print(f" PPT文件: {len(results['ppt_files'])}")
    print(f" PDF文件: {len(results['pdf_files'])}")
    print(f" XLS文件: {len(results['xls_files'])}")
    print("=" * 60)


if __name__ == '__main__':
    main()