199 lines
6.4 KiB
Python
199 lines
6.4 KiB
Python
#!/usr/bin/env python
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
Extract the text content of .doc (Word 97-2003) files.
Supports several methods: OLE stream extraction and structured parsing.
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import olefile
|
||
import re
|
||
from pathlib import Path
|
||
|
||
|
||
def extract_text_from_ole(filepath):
    """Extract text from a .doc file through its OLE container.

    Opens ``filepath`` with ``olefile``, reads the ``WordDocument`` stream and
    passes its bytes (together with the open container) to
    ``extract_text_from_fib``.

    Args:
        filepath: Path of the file to open.

    Returns:
        The string produced by ``extract_text_from_fib``; the message
        "不是有效的Word文档" when the container has no ``WordDocument``
        stream; or "错误: <message>" when any exception is raised.
    """
    try:
        ole = olefile.OleFileIO(filepath)
        try:
            # A Word binary document must expose a 'WordDocument' stream.
            if not ole.exists('WordDocument'):
                return "不是有效的Word文档"

            data = ole.openstream('WordDocument').read()
            return extract_text_from_fib(data, ole)
        finally:
            # Always release the underlying file handle, including on the
            # early return and when extraction raises.
            ole.close()
    except Exception as e:
        # Best-effort API: failures are reported as text, never raised.
        return f"错误: {str(e)}"
|
||
|
||
|
||
def _printable_utf16_text(raw):
    """Decode ``raw`` as UTF-16-LE and drop everything but readable characters.

    Kept characters: U+4E00-U+9FFF, U+0020-U+007E, newline, carriage return
    and tab. Returns an empty string when ``raw`` cannot be decoded.
    """
    try:
        decoded = raw.decode('utf-16-le', errors='ignore')
    except (AttributeError, TypeError, UnicodeError):
        # Best effort: an undecodable source simply contributes no text.
        return ''
    return re.sub(r'[^\u4e00-\u9fff\u0020-\u007e\n\r\t]', '', decoded)


def extract_text_from_fib(data, ole):
    """Pull readable text out of the main document stream and the table stream.

    This is a heuristic, not a structural parser: both byte strings are
    decoded wholesale as UTF-16-LE and filtered down to readable characters.
    No FIB fields or piece tables are interpreted.

    Args:
        data: Bytes of the ``WordDocument`` stream.
        ole: An open container offering ``exists(name)`` and
            ``openstream(name)``; used to locate the ``1Table`` / ``0Table``
            stream.

    Returns:
        The filtered text of ``data`` and of the table stream joined by a
        newline (parts that are blank after filtering are skipped);
        "无法提取文本内容" when both parts are blank; "无法找到表格流" when
        neither table stream exists; or "解析错误: <message>" when an
        exception is raised.
    """
    try:
        # Prefer '1Table'; fall back to '0Table' when it is absent.
        table_name = '1Table' if ole.exists('1Table') else '0Table'
        if not ole.exists(table_name):
            return "无法找到表格流"

        table_data = ole.openstream(table_name).read()

        # Main document text first, then whatever the table stream yields.
        candidates = (
            _printable_utf16_text(data),
            _printable_utf16_text(table_data),
        )
        text_parts = [text for text in candidates if text.strip()]

        return '\n'.join(text_parts) if text_parts else "无法提取文本内容"

    except Exception as e:
        # Best-effort API: failures are reported as text, never raised.
        return f"解析错误: {str(e)}"
|
||
|
||
|
||
def _readable_runs(raw, encoding):
    """Decode ``raw`` with ``encoding`` and return its readable character runs.

    Undecodable bytes are ignored. A run is kept only when it is longer than
    one character after stripping whitespace. Returns an empty list when the
    decode itself fails.
    """
    try:
        decoded = raw.decode(encoding, errors='ignore')
    except (LookupError, UnicodeError):
        # Best effort: an unusable encoding simply contributes nothing.
        return []
    # U+4E00-U+9FFF plus U+0020-U+007E, whitespace and CJK punctuation.
    # NOTE(review): the ASCII punctuation listed explicitly is already covered
    # by the \u0020-\u007e range; the pattern is kept unchanged.
    runs = re.findall(r'[\u4e00-\u9fff\u0020-\u007e\n\r\t\.,;:!?()()。,;:!?、]+', decoded)
    return [run for run in runs if len(run.strip()) > 1]


def extract_text_simple(filepath):
    """Extract readable text from a file by brute-force decoding its bytes.

    The whole file is decoded first as UTF-16-LE and then as GBK; readable
    runs from both passes are collected, de-duplicated in first-seen order
    and joined with newlines.

    Args:
        filepath: Path of the file to read in binary mode.

    Returns:
        The joined runs; "无法提取文本" when nothing readable was found; or
        "错误: <message>" when an exception is raised (e.g. the file cannot
        be opened).
    """
    try:
        with open(filepath, 'rb') as f:
            data = f.read()

        text_parts = []
        for encoding in ('utf-16-le', 'gbk'):
            text_parts.extend(_readable_runs(data, encoding))

        # dict preserves insertion order, giving an order-stable de-duplication.
        unique_parts = list(dict.fromkeys(text_parts))

        return '\n'.join(unique_parts) if unique_parts else "无法提取文本"

    except Exception as e:
        # Best-effort API: failures are reported as text, never raised.
        return f"错误: {str(e)}"
|
||
|
||
|
||
def process_doc_file(filepath):
    """Run both extraction strategies on a single .doc file.

    Prints a short banner naming the file, then calls
    ``extract_text_from_ole`` followed by ``extract_text_simple``.

    Args:
        filepath: Path of the file to process.

    Returns:
        A dict with the OLE-based result under ``'ole'`` and the brute-force
        result under ``'simple'``.
    """
    banner = f"\n处理文件: {filepath}"
    print(banner)
    print("=" * 50)

    # Keyword arguments are evaluated left to right: OLE parsing runs first,
    # then the simple byte-level extraction.
    return dict(
        ole=extract_text_from_ole(filepath),
        simple=extract_text_simple(filepath),
    )
|
||
|
||
|
||
def process_directory(directory, output_file=None):
    """Process every .doc file found under ``directory``, recursively.

    Files are matched case-insensitively on the ``.doc`` suffix, so ``.docx``
    files are not picked up.

    Args:
        directory: Root directory to walk.
        output_file: Accepted for interface compatibility; currently unused.

    Returns:
        A dict mapping each file's path relative to ``directory`` to the
        dict returned by ``process_doc_file``, or to ``{'error': <message>}``
        when processing that file raised. For files directly inside
        ``directory`` the key is just the filename.
    """
    results = {}

    for root, _dirs, files in os.walk(directory):
        for file in files:
            # A name ending in '.doc' can never end in '.docx', so a single
            # suffix check is sufficient.
            if not file.lower().endswith('.doc'):
                continue

            filepath = os.path.join(root, file)
            # Key by relative path so same-named files in different
            # subdirectories do not overwrite each other.
            key = os.path.relpath(filepath, directory)
            try:
                results[key] = process_doc_file(filepath)
            except Exception as e:
                results[key] = {'error': str(e)}

    return results
|
||
|
||
|
||
def main():
    """Command-line entry point.

    Expects one argument: a file or a directory. For a file, prints up to
    2000 characters of each extraction result. For a directory, prints the
    number of processed files and, per file, either the error or a preview
    of up to 200 characters of the simple result. Exits with status 1 when
    the argument is missing or the path is neither a file nor a directory.
    """
    argv = sys.argv
    if len(argv) < 2:
        print("用法: python extract_doc.py <文件或目录路径>")
        print("示例: python extract_doc.py 参考文档\\15.XXX医院绩效方案.doc")
        sys.exit(1)

    target = argv[1]

    # Single file: show both extraction results, truncated by slicing.
    if os.path.isfile(target):
        extracted = process_doc_file(target)
        sections = (
            ("\n--- OLE解析结果 ---", 'ole'),
            ("\n--- 简单提取结果 ---", 'simple'),
        )
        for heading, key in sections:
            print(heading)
            print(extracted[key][:2000])
        return

    # Anything that is neither a file nor a directory is rejected.
    if not os.path.isdir(target):
        print(f"路径不存在: {target}")
        sys.exit(1)

    # Directory: one summary entry per processed file.
    summary = process_directory(target)
    print(f"\n处理了 {len(summary)} 个.doc文件")

    for name, outcome in summary.items():
        print(f"\n{name}:")
        if 'error' in outcome:
            print(f" 错误: {outcome['error']}")
            continue
        if outcome['simple']:
            preview = outcome['simple'][:200]
        else:
            preview = "无内容"
        print(f" 预览: {preview}...")
|
||
|
||
|
||
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|