189 lines
5.9 KiB
Python
189 lines
5.9 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
提取所有PPT、PDF、XLS文档内容
|
||
"""
|
||
import os
|
||
import json
|
||
import sys
|
||
|
||
# 尝试导入各种库
|
||
try:
|
||
from pptx import Presentation
|
||
HAS_PPTX = True
|
||
except ImportError:
|
||
HAS_PPTX = False
|
||
print("Warning: python-pptx not available for .pptx files")
|
||
|
||
try:
|
||
import pdfplumber
|
||
HAS_PDFPLUMBER = True
|
||
except ImportError:
|
||
HAS_PDFPLUMBER = False
|
||
print("Warning: pdfplumber not available")
|
||
|
||
try:
|
||
import pandas as pd
|
||
HAS_PANDAS = True
|
||
except ImportError:
|
||
HAS_PANDAS = False
|
||
print("Warning: pandas not available")
|
||
|
||
try:
|
||
import openpyxl
|
||
HAS_OPENPYXL = True
|
||
except ImportError:
|
||
HAS_OPENPYXL = False
|
||
print("Warning: openpyxl not available")
|
||
|
||
try:
|
||
import xlrd
|
||
HAS_XLRD = True
|
||
except ImportError:
|
||
HAS_XLRD = False
|
||
print("Warning: xlrd not available")
|
||
|
||
|
||
def extract_ppt_content(filepath):
    """Extract the text and tables of every slide in a PowerPoint file.

    Only the ``.pptx`` format is handled, using python-pptx's
    ``Presentation``. Any other extension (including legacy ``.ppt``)
    yields an error result instead of being parsed.

    Args:
        filepath: Path of the presentation to read. The extension is
            matched case-insensitively.

    Returns:
        On success, a list with one dict per slide containing
        ``slide_number`` (1-based), ``text`` (the stripped text of each
        shape whose text is non-empty) and ``tables`` (each table as a
        list of rows, each row a list of cell strings).
        On failure, a dict with a single ``error`` key.
    """
    if not filepath.lower().endswith('.pptx'):
        # Not a .pptx file: report it the same way the script always has.
        return {'error': f'无法处理旧格式PPT文件: {filepath}'}

    if not HAS_PPTX:
        # Report the real cause rather than blaming the file format.
        return {'error': 'python-pptx not available'}

    try:
        slides = []
        for slide_num, slide in enumerate(Presentation(filepath).slides, 1):
            slide_content = {
                'slide_number': slide_num,
                'text': [],
                'tables': [],
            }
            for shape in slide.shapes:
                # Not every shape type exposes a ``text`` attribute.
                if hasattr(shape, "text") and shape.text.strip():
                    slide_content['text'].append(shape.text.strip())
                # getattr keeps shapes without a ``has_table`` attribute
                # from failing the whole file.
                if getattr(shape, 'has_table', False):
                    slide_content['tables'].append([
                        [cell.text for cell in row.cells]
                        for row in shape.table.rows
                    ])
            slides.append(slide_content)
        return slides
    except Exception as e:
        # Best-effort extraction: the caller stores the failure per file.
        return {'error': str(e)}
|
||
|
||
|
||
def extract_pdf_content(filepath):
    """Extract the text and tables of every page in a PDF file.

    Args:
        filepath: Path of the PDF to open with pdfplumber.

    Returns:
        On success, a list with one dict per page containing
        ``page_number`` (1-based), ``text`` (empty string when the page
        yields no text) and ``tables`` (empty list when the page yields
        no tables).
        On failure, or when pdfplumber is not installed, a dict with a
        single ``error`` key.
    """
    try:
        if not HAS_PDFPLUMBER:
            return {'error': 'pdfplumber not available'}

        pages = []
        with pdfplumber.open(filepath) as document:
            for number, page in enumerate(document.pages, start=1):
                text = page.extract_text()
                tables = page.extract_tables()
                pages.append({
                    'page_number': number,
                    # Falsy results are normalised to empty containers.
                    'text': text or '',
                    'tables': tables or [],
                })
        return pages
    except Exception as exc:
        # Any failure replaces partial output with an error record.
        return {'error': str(exc)}
|
||
|
||
|
||
def _json_safe(value):
    """Return *value* as-is, or its ISO string if it has ``isoformat()``."""
    iso = getattr(value, 'isoformat', None)
    return iso() if callable(iso) else value


def extract_xls_content(filepath):
    """Extract every sheet of an Excel workbook as JSON-friendly data.

    ``.xlsx`` files (matched case-insensitively) are read with the
    ``openpyxl`` engine; every other file is read with ``xlrd``.

    Args:
        filepath: Path of the workbook to read.

    Returns:
        On success, a dict mapping each sheet name to
        ``{'columns': [...], 'data': [[...], ...]}``. Missing cells are
        replaced by empty strings, and date/time cells and headers are
        converted to ISO-8601 strings so the result can be passed to
        ``json.dump``.
        On failure, or when pandas is not installed, a dict with a single
        ``error`` key.
    """
    try:
        if not HAS_PANDAS:
            return {'error': 'pandas not available'}

        engine = 'openpyxl' if filepath.lower().endswith('.xlsx') else 'xlrd'
        content = {}
        # The context manager closes the workbook handle even when a sheet
        # fails to parse.
        with pd.ExcelFile(filepath, engine=engine) as workbook:
            for sheet_name in workbook.sheet_names:
                df = pd.read_excel(workbook, sheet_name=sheet_name)
                # Casting to object keeps datetime cells as Timestamp
                # objects instead of raw nanosecond integers, so that
                # _json_safe can render them as ISO strings.
                rows = df.fillna('').astype(object).values.tolist()
                content[sheet_name] = {
                    'columns': [_json_safe(col) for col in df.columns.tolist()],
                    'data': [[_json_safe(cell) for cell in row] for row in rows],
                }
        return content
    except Exception as e:
        # Any failure replaces partial output with an error record.
        return {'error': str(e)}
|
||
|
||
|
||
def process_directory(base_dir, output_file):
    """Extract every supported document under ``<base_dir>/参考文档``.

    Regular files in that directory are visited in sorted name order and
    dispatched by extension (matched case-insensitively): ``.ppt``/``.pptx``
    to ``extract_ppt_content``, ``.pdf`` to ``extract_pdf_content`` and
    ``.xls``/``.xlsx`` to ``extract_xls_content``. Sub-directories are
    skipped; files with other extensions are announced but not extracted.

    Args:
        base_dir: Directory that contains the ``参考文档`` folder.
        output_file: Path of the JSON file to write the results to.

    Returns:
        A dict with the keys ``ppt_files``, ``pdf_files`` and ``xls_files``,
        each mapping a file name to whatever its extractor returned.

    Raises:
        OSError: If the reference directory cannot be listed or the output
            file cannot be written.
    """
    results = {
        'ppt_files': {},
        'pdf_files': {},
        'xls_files': {},
    }

    ref_dir = os.path.join(base_dir, '参考文档')

    for filename in sorted(os.listdir(ref_dir)):
        filepath = os.path.join(ref_dir, filename)
        if not os.path.isfile(filepath):
            continue

        print(f"处理文件: {filename}")

        # Compare on a lower-cased name so "REPORT.PDF" is not skipped.
        lowered = filename.lower()
        try:
            if lowered.endswith(('.ppt', '.pptx')):
                print(" -> 提取PPT内容...")
                results['ppt_files'][filename] = extract_ppt_content(filepath)
            elif lowered.endswith('.pdf'):
                print(" -> 提取PDF内容...")
                results['pdf_files'][filename] = extract_pdf_content(filepath)
            elif lowered.endswith(('.xls', '.xlsx')):
                print(" -> 提取XLS内容...")
                results['xls_files'][filename] = extract_xls_content(filepath)
        except Exception as e:
            # One unreadable document must not stop the rest of the run.
            print(f" -> 错误: {e}")

    # Save the results. ``default=str`` is a deliberate fallback: a value
    # json cannot encode (e.g. a date read from a spreadsheet) is written
    # as text instead of aborting the dump and losing the whole run.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2, default=str)

    print(f"\n所有内容已保存到: {output_file}")
    return results
|
||
|
||
|
||
def main(base_dir=None):
    """Run the extraction and print a per-type summary.

    Args:
        base_dir: Directory that contains the ``参考文档`` folder. When
            omitted, the first command-line argument is used if one was
            given; otherwise the original default ``d:\\医院绩效系统``
            applies. The JSON output is written to
            ``all_docs_content.json`` inside this directory.
    """
    if base_dir is None:
        # Let the script run on other machines without editing the source.
        base_dir = sys.argv[1] if len(sys.argv) > 1 else r'd:\医院绩效系统'
    output_file = os.path.join(base_dir, 'all_docs_content.json')

    banner = "=" * 60
    print(banner)
    print("开始提取所有文档内容")
    print(banner)

    results = process_directory(base_dir, output_file)

    # Print summary statistics.
    print("\n" + banner)
    print("提取完成统计:")
    print(f" PPT文件: {len(results['ppt_files'])} 个")
    print(f" PDF文件: {len(results['pdf_files'])} 个")
    print(f" XLS文件: {len(results['xls_files'])} 个")
    print(banner)
|
||
|
||
|
||
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|