Files
hospital_performance/extract_all_docs.py
2026-02-28 15:16:15 +08:00

189 lines
5.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
提取所有PPT、PDF、XLS文档内容
"""
import os
import json
import sys
# Optional third-party dependencies: each one is imported if present and a
# HAS_* flag records the outcome so the extractors can degrade gracefully.
try:
    from pptx import Presentation
except ImportError:
    HAS_PPTX = False
    print("Warning: python-pptx not available for .pptx files")
else:
    HAS_PPTX = True

try:
    import pdfplumber
except ImportError:
    HAS_PDFPLUMBER = False
    print("Warning: pdfplumber not available")
else:
    HAS_PDFPLUMBER = True

try:
    import pandas as pd
except ImportError:
    HAS_PANDAS = False
    print("Warning: pandas not available")
else:
    HAS_PANDAS = True

try:
    import openpyxl
except ImportError:
    HAS_OPENPYXL = False
    print("Warning: openpyxl not available")
else:
    HAS_OPENPYXL = True

try:
    import xlrd
except ImportError:
    HAS_XLRD = False
    print("Warning: xlrd not available")
else:
    HAS_XLRD = True


def extract_ppt_content(filepath):
    """Extract slide text and tables from a PowerPoint ``.pptx`` file.

    Args:
        filepath: Path of the presentation as a string. Only ``.pptx`` is
            supported; the extension is matched case-insensitively.

    Returns:
        On success, a list with one dict per slide containing
        ``slide_number`` (1-based), ``text`` (the stripped text of every
        shape whose text is not blank) and ``tables`` (one list of rows per
        table, each row being a list of cell strings).
        On any failure, a dict of the form ``{'error': message}``.
    """
    try:
        # Check the extension before the dependency: anything that is not
        # .pptx (e.g. legacy binary .ppt) cannot be parsed regardless of
        # which libraries are installed.
        if not filepath.lower().endswith('.pptx'):
            return {'error': f'无法处理旧格式PPT文件: {filepath}'}
        if not HAS_PPTX:
            # Report the real cause instead of blaming the file format.
            return {'error': 'python-pptx not available'}

        content = []
        prs = Presentation(filepath)
        for slide_num, slide in enumerate(prs.slides, 1):
            slide_content = {
                'slide_number': slide_num,
                'text': [],
                'tables': [],
            }
            for shape in slide.shapes:
                if hasattr(shape, "text") and shape.text.strip():
                    slide_content['text'].append(shape.text.strip())
                # getattr keeps this safe for shape objects that do not
                # expose a has_table attribute.
                if getattr(shape, 'has_table', False):
                    slide_content['tables'].append(
                        [[cell.text for cell in row.cells]
                         for row in shape.table.rows]
                    )
            content.append(slide_content)
        return content
    except Exception as e:
        # Best-effort extraction: a broken file becomes an error entry
        # rather than aborting the caller's batch run.
        return {'error': str(e)}


def extract_pdf_content(filepath):
    """Extract the text and tables of every page of a PDF via pdfplumber.

    Returns:
        A list with one dict per page (``page_number`` starting at 1,
        ``text`` as a string that is empty when nothing was extracted, and
        ``tables`` as a list that is empty when none were found), or
        ``{'error': message}`` when pdfplumber is unavailable or extraction
        raises.
    """
    try:
        if not HAS_PDFPLUMBER:
            return {'error': 'pdfplumber not available'}
        with pdfplumber.open(filepath) as document:
            return [
                {
                    'page_number': index,
                    'text': page.extract_text() or '',
                    'tables': page.extract_tables() or [],
                }
                for index, page in enumerate(document.pages, 1)
            ]
    except Exception as exc:
        return {'error': str(exc)}


def _to_json_safe(value):
    """Return *value* in a form that ``json.dump`` can encode."""
    if value is None or isinstance(value, (str, bool, int, float)):
        return value
    if hasattr(value, 'isoformat'):
        # Dates, times and pandas Timestamps.
        return value.isoformat()
    if hasattr(value, 'item'):
        # NumPy scalars unwrap to the matching Python scalar.
        return value.item()
    return str(value)


def extract_xls_content(filepath):
    """Extract every sheet of an Excel workbook as JSON-serializable data.

    Args:
        filepath: Path of the workbook as a string. Files ending in
            ``.xlsx`` (case-insensitive) are read with the openpyxl engine,
            everything else with xlrd.

    Returns:
        On success, a dict mapping each sheet name to
        ``{'columns': [...], 'data': [[...], ...]}`` where missing cells are
        replaced by ``''`` and date/time cells are ISO-8601 strings.
        On any failure, a dict of the form ``{'error': message}``.
    """
    try:
        if not HAS_PANDAS:
            return {'error': 'pandas not available'}

        engine = 'openpyxl' if filepath.lower().endswith('.xlsx') else 'xlrd'
        content = {}
        # The context manager closes the workbook handle even when a sheet
        # fails to parse.
        with pd.ExcelFile(filepath, engine=engine) as xl_file:
            for sheet_name in xl_file.sheet_names:
                df = pd.read_excel(xl_file, sheet_name=sheet_name)
                # Cast to object after filling so datetime columns yield
                # Timestamp objects rather than raw integer nanoseconds.
                rows = df.fillna('').astype(object).values.tolist()
                content[sheet_name] = {
                    'columns': [_to_json_safe(col)
                                for col in df.columns.tolist()],
                    'data': [[_to_json_safe(cell) for cell in row]
                             for row in rows],
                }
        return content
    except Exception as e:
        # Best-effort extraction: report the failure instead of raising.
        return {'error': str(e)}


def process_directory(base_dir, output_file):
    """Extract every supported document in the reference folder to JSON.

    Scans ``<base_dir>/参考文档`` (non-recursively, in sorted order), runs the
    matching extractor on each PPT/PDF/Excel file and writes the combined
    result to *output_file* as UTF-8 JSON.

    Args:
        base_dir: Directory that contains the ``参考文档`` sub-folder.
        output_file: Path of the JSON file to write.

    Returns:
        A dict with the keys ``ppt_files``, ``pdf_files`` and ``xls_files``,
        each mapping a file name to its extracted content or to
        ``{'error': message}`` when extraction raised.

    Raises:
        OSError: If the reference folder cannot be listed or the output
            file cannot be written.
    """
    results = {
        'ppt_files': {},
        'pdf_files': {},
        'xls_files': {},
    }
    ref_dir = os.path.join(base_dir, '参考文档')

    for filename in sorted(os.listdir(ref_dir)):
        filepath = os.path.join(ref_dir, filename)
        if not os.path.isfile(filepath):
            continue
        print(f"处理文件: {filename}")

        # Match extensions case-insensitively so files such as "A.PDF" are
        # not silently skipped.
        lowered = filename.lower()
        if lowered.endswith(('.ppt', '.pptx')):
            bucket, label = 'ppt_files', 'PPT'
        elif lowered.endswith('.pdf'):
            bucket, label = 'pdf_files', 'PDF'
        elif lowered.endswith(('.xls', '.xlsx')):
            bucket, label = 'xls_files', 'XLS'
        else:
            continue

        print(f" -> 提取{label}内容...")
        try:
            if bucket == 'ppt_files':
                content = extract_ppt_content(filepath)
            elif bucket == 'pdf_files':
                content = extract_pdf_content(filepath)
            else:
                content = extract_xls_content(filepath)
        except Exception as e:
            print(f" -> 错误: {e}")
            # Record the failure in the same {'error': ...} shape the
            # extractors use, instead of dropping the file from the report.
            content = {'error': str(e)}
        results[bucket][filename] = content

    # Save the results. default=str is a deliberate last-resort fallback so
    # that a single non-JSON value cannot abort the dump and lose the run.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2, default=str)
    print(f"\n所有内容已保存到: {output_file}")
    return results


def main(base_dir=r'd:\医院绩效系统'):
    """Extract all reference documents under *base_dir* and print a summary.

    Args:
        base_dir: Project directory containing the reference-document
            folder. The combined JSON is written to
            ``all_docs_content.json`` inside this directory. Defaults to
            the original hard-coded location.
    """
    output_file = os.path.join(base_dir, 'all_docs_content.json')
    separator = "=" * 60

    print(separator)
    print("开始提取所有文档内容")
    print(separator)

    results = process_directory(base_dir, output_file)

    # Print summary statistics.
    print("\n" + separator)
    print("提取完成统计:")
    print(f" PPT文件: {len(results['ppt_files'])}")
    print(f" PDF文件: {len(results['pdf_files'])}")
    print(f" XLS文件: {len(results['xls_files'])}")
    print(separator)


if __name__ == '__main__':
    # An optional first command-line argument overrides the default base
    # directory; with no argument the original path is used.
    main(*sys.argv[1:2])