Files
hospital_performance/extract_doc.py
2026-02-28 15:16:15 +08:00

199 lines
6.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
提取 .doc (Word 97-2003) 文件内容的脚本
支持多种方法OLE流提取、结构化解析
"""
import os
import sys
import olefile
import re
from pathlib import Path
def extract_text_from_ole(filepath):
    """Extract text from a .doc file through its OLE container.

    Opens ``filepath`` with ``olefile``, checks that the container has a
    ``WordDocument`` stream, reads that stream in full and passes it (together
    with the open container) to ``extract_text_from_fib``.

    Args:
        filepath: Path of the .doc file to open.

    Returns:
        The text produced by ``extract_text_from_fib``; the message
        "不是有效的Word文档" when there is no ``WordDocument`` stream; or
        "错误: <message>" when any exception is raised along the way.
    """
    try:
        ole = olefile.OleFileIO(filepath)
        try:
            # A Word binary document always stores its main stream under
            # this name; without it there is nothing to extract.
            if not ole.exists('WordDocument'):
                return "不是有效的Word文档"
            data = ole.openstream('WordDocument').read()
            # Simplified extraction: the real .doc layout is far more complex.
            return extract_text_from_fib(data, ole)
        finally:
            # Always release the underlying file handle, including on the
            # early-return and error paths.
            ole.close()
    except Exception as e:
        return f"错误: {str(e)}"
def extract_text_from_fib(data, ole):
    """Pull readable text out of the WordDocument and table streams.

    This is a simplified approach: it does not parse the FIB or the piece
    table. It decodes the raw bytes of the ``WordDocument`` stream and of the
    table stream as UTF-16-LE and keeps only the characters that look like
    text.

    Args:
        data: Raw bytes of the ``WordDocument`` stream.
        ole: Open OLE container; must provide ``exists(name)`` and
            ``openstream(name)`` (the latter returning an object with
            ``read()``).

    Returns:
        The filtered text of both streams joined by a newline (document
        stream first); "无法提取文本内容" when neither stream yields any
        non-blank text; "无法找到表格流" when the container has neither a
        ``1Table`` nor a ``0Table`` stream; or "解析错误: <message>" when an
        exception is raised.
    """
    try:
        # Prefer '1Table', fall back to '0Table'.
        table_name = next(
            (name for name in ('1Table', '0Table') if ole.exists(name)),
            None,
        )
        if table_name is None:
            return "无法找到表格流"
        table_data = ole.openstream(table_name).read()

        text_parts = []
        for raw in (data, table_data):
            printable = _printable_utf16_text(raw)
            if printable.strip():
                text_parts.append(printable)
        return '\n'.join(text_parts) if text_parts else "无法提取文本内容"
    except Exception as e:
        return f"解析错误: {str(e)}"


def _printable_utf16_text(raw):
    """Decode ``raw`` as UTF-16-LE and drop everything that is not text.

    Kept characters: U+4E00-U+9FFF, printable ASCII (U+0020-U+007E), newline,
    carriage return and tab. Undecodable byte sequences are ignored, so this
    does not raise for any ``bytes`` input.
    """
    decoded = raw.decode('utf-16-le', errors='ignore')
    return re.sub(r'[^\u4e00-\u9fff\u0020-\u007e\n\r\t]', '', decoded)
def extract_text_simple(filepath):
    """Extract readable text by scanning the raw bytes of a file.

    The whole file is read and decoded twice, first as UTF-16-LE and then as
    GBK, ignoring undecodable bytes each time. From each decoding, runs of
    readable characters are collected; runs whose stripped length is 1 or
    less are discarded. Duplicate runs are removed, keeping first-seen order.

    Args:
        filepath: Path of the file to scan.

    Returns:
        The unique runs joined by newlines; "无法提取文本" when no run
        qualifies; or "错误: <message>" when an exception is raised (for
        example, the file cannot be opened).
    """
    try:
        with open(filepath, 'rb') as f:
            data = f.read()

        text_parts = []
        # UTF-16 LE is tried first, then GB2312/GBK.
        for encoding in ('utf-16-le', 'gbk'):
            decoded = data.decode(encoding, errors='ignore')
            text_parts.extend(_readable_runs(decoded))

        # dict preserves insertion order, so this de-duplicates while keeping
        # the first occurrence of every run.
        unique_parts = list(dict.fromkeys(text_parts))
        return '\n'.join(unique_parts) if unique_parts else "无法提取文本"
    except Exception as e:
        return f"错误: {str(e)}"


def _readable_runs(text):
    """Return the runs of readable characters in ``text`` longer than one char.

    A readable character is a CJK ideograph (U+4E00-U+9FFF), printable ASCII
    (U+0020-U+007E, which already covers ASCII punctuation), newline, carriage
    return, tab, or common CJK punctuation: U+3001, U+3002 and the full-width
    forms of ! ( ) , : ; ?. Including the full-width forms keeps a Chinese
    sentence in one run instead of splitting it at every comma.
    """
    pattern = (
        r'[\u4e00-\u9fff\u0020-\u007e\n\r\t'
        r'\u3001\u3002\uff01\uff08\uff09\uff0c\uff1a\uff1b\uff1f]+'
    )
    return [run for run in re.findall(pattern, text) if len(run.strip()) > 1]
def process_doc_file(filepath):
    """Run both extraction methods on one .doc file.

    Prints a header naming the file followed by a 50-character rule, then
    runs the OLE-based extraction and the raw-byte extraction, in that order.

    Args:
        filepath: Path of the .doc file to process.

    Returns:
        A dict with the OLE-based result under 'ole' and the raw-byte result
        under 'simple'.
    """
    separator = "=" * 50
    print(f"\n处理文件: {filepath}")
    print(separator)

    # Dict displays evaluate their values left to right, so the OLE method
    # still runs before the simple one.
    return {
        'ole': extract_text_from_ole(filepath),
        'simple': extract_text_simple(filepath),
    }
def process_directory(directory, output_file=None):
    """Process every .doc file found under a directory tree.

    Walks ``directory`` recursively and runs ``process_doc_file`` on each file
    whose name ends in ".doc" (case-insensitive; ".docx" names do not match).

    Args:
        directory: Root directory to walk.
        output_file: Accepted for interface compatibility; this function does
            not currently use it.

    Returns:
        A dict mapping each file's path relative to ``directory`` to the dict
        returned by ``process_doc_file``, or to ``{'error': <message>}`` when
        processing that file raised. For files directly inside ``directory``
        the key is just the file name; using the relative path keeps
        same-named files in different subdirectories from overwriting each
        other.
    """
    results = {}
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # A name ending in '.doc' can never end in '.docx', so a single
            # suffix test is enough to exclude the newer format.
            if not file.lower().endswith('.doc'):
                continue
            filepath = os.path.join(root, file)
            key = os.path.relpath(filepath, directory)
            try:
                results[key] = process_doc_file(filepath)
            except Exception as e:
                # One bad file must not abort the whole walk.
                results[key] = {'error': str(e)}
    return results
def main():
    """Command-line entry point.

    Expects one argument, a file or directory path. With no argument it
    prints usage text and exits with status 1. For a file it prints up to
    the first 2000 characters of each extraction result. For a directory it
    prints the number of .doc files processed and, per file, either the error
    or a preview of up to 200 characters of the simple-extraction result. Any
    other path prints a "path does not exist" message and exits with status 1.
    """
    argv = sys.argv
    if len(argv) < 2:
        print("用法: python extract_doc.py <文件或目录路径>")
        print("示例: python extract_doc.py 参考文档\\15.XXX医院绩效方案.doc")
        sys.exit(1)

    path = argv[1]

    if os.path.isfile(path):
        result = process_doc_file(path)
        sections = (
            ("\n--- OLE解析结果 ---", 'ole'),
            ("\n--- 简单提取结果 ---", 'simple'),
        )
        for heading, key in sections:
            print(heading)
            # Slicing already returns the whole string when it is shorter
            # than the limit.
            print(result[key][:2000])
        return

    if not os.path.isdir(path):
        print(f"路径不存在: {path}")
        sys.exit(1)

    results = process_directory(path)
    print(f"\n处理了 {len(results)} 个.doc文件")
    for filename, result in results.items():
        print(f"\n{filename}:")
        if 'error' in result:
            print(f" 错误: {result['error']}")
            continue
        # An empty extraction result falls back to the placeholder text.
        preview = result['simple'][:200] or "无内容"
        print(f" 预览: {preview}...")
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()