提交文件
This commit is contained in:
198
extract_doc.py
Normal file
198
extract_doc.py
Normal file
@@ -0,0 +1,198 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
提取 .doc (Word 97-2003) 文件内容的脚本
|
||||
支持多种方法:OLE流提取、结构化解析
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import olefile
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def extract_text_from_ole(filepath):
    """Extract text from a Word 97-2003 (.doc) file via its OLE container.

    Opens ``filepath`` with ``olefile``, reads the whole ``WordDocument``
    stream and passes it, together with the still-open container, to
    ``extract_text_from_fib``.

    Args:
        filepath: Path of the .doc file to read.

    Returns:
        The string produced by ``extract_text_from_fib`` on success;
        ``"不是有效的Word文档"`` when the container has no ``WordDocument``
        stream; ``"错误: <message>"`` when any exception is raised while
        opening or reading the file. The function never raises ``Exception``
        subclasses to the caller.
    """
    ole = None
    try:
        ole = olefile.OleFileIO(filepath)

        # A Word binary document always carries a 'WordDocument' stream.
        if not ole.exists('WordDocument'):
            return "不是有效的Word文档"

        # Simplified extraction: the real .doc layout is far more complex,
        # so the raw main stream is handed to the heuristic extractor.
        data = ole.openstream('WordDocument').read()
        return extract_text_from_fib(data, ole)
    except Exception as e:
        return f"错误: {str(e)}"
    finally:
        # Always release the underlying file handle, including on the
        # early-return and error paths.
        if ole is not None:
            ole.close()
|
||||
|
||||
|
||||
def _filter_printable_utf16(raw):
    """Decode ``raw`` as UTF-16-LE and drop everything except CJK ideographs
    (U+4E00-U+9FFF), printable ASCII, newline, carriage return and tab."""
    decoded = raw.decode('utf-16-le', errors='ignore')
    return re.sub(r'[^\u4e00-\u9fff\u0020-\u007e\n\r\t]', '', decoded)


def extract_text_from_fib(data, ole):
    """Heuristically pull readable text out of a .doc's streams.

    No FIB or piece-table parsing is performed. The function decodes the
    ``WordDocument`` bytes and the table stream as UTF-16-LE and keeps only
    characters that look like text.

    Args:
        data: Raw bytes of the ``WordDocument`` stream.
        ole: An open OLE container exposing ``exists(name)`` and
            ``openstream(name)``; used to locate the ``1Table`` stream,
            falling back to ``0Table``.

    Returns:
        The filtered text of the main stream and of the table stream joined
        by a newline (empty pieces are skipped); ``"无法提取文本内容"`` when
        neither stream yields text; ``"无法找到表格流"`` when neither table
        stream exists; ``"解析错误: <message>"`` when an exception occurs.
    """
    try:
        # Prefer '1Table'; fall back to '0Table' when it is absent.
        table_name = '1Table' if ole.exists('1Table') else '0Table'
        if not ole.exists(table_name):
            return "无法找到表格流"

        table_data = ole.openstream(table_name).read()

        text_parts = []
        # Main document stream first, then the table stream.
        for raw in (data, table_data):
            try:
                printable = _filter_printable_utf16(raw)
            except Exception:
                # Best effort: a stream that cannot be decoded is skipped,
                # but KeyboardInterrupt/SystemExit are no longer swallowed.
                continue
            if printable.strip():
                text_parts.append(printable)

        return '\n'.join(text_parts) if text_parts else "无法提取文本内容"
    except Exception as e:
        return f"解析错误: {str(e)}"
|
||||
|
||||
|
||||
def extract_text_simple(filepath):
    """Brute-force text extraction: scan the raw file bytes for readable runs.

    The whole file is read as bytes and decoded twice, first as UTF-16-LE and
    then as GBK, ignoring undecodable bytes. From each decoding, runs of CJK
    ideographs (U+4E00-U+9FFF), printable ASCII, whitespace and common
    Chinese/ASCII punctuation are collected; runs whose stripped length is 1
    or less are discarded. Duplicate runs are removed, keeping the first
    occurrence.

    Args:
        filepath: Path of the file to scan.

    Returns:
        The unique runs joined by newlines; ``"无法提取文本"`` when no run is
        found; ``"错误: <message>"`` when reading the file (or anything else)
        raises an exception.
    """
    text_run = r'[\u4e00-\u9fff\u0020-\u007e\n\r\t\.,;:!?()()。,;:!?、]+'
    try:
        with open(filepath, 'rb') as f:
            data = f.read()

        text_parts = []
        # UTF-16-LE is the Windows Unicode encoding; GBK covers GB2312 text.
        for encoding in ('utf-16-le', 'gbk'):
            try:
                decoded = data.decode(encoding, errors='ignore')
                runs = [c for c in re.findall(text_run, decoded)
                        if len(c.strip()) > 1]
            except Exception:
                # Best effort: skip an encoding that fails, without hiding
                # KeyboardInterrupt/SystemExit as a bare except would.
                continue
            text_parts.extend(runs)

        # dict preserves insertion order, so this is an ordered de-duplication.
        unique_parts = list(dict.fromkeys(text_parts))

        return '\n'.join(unique_parts) if unique_parts else "无法提取文本"
    except Exception as e:
        return f"错误: {str(e)}"
|
||||
|
||||
|
||||
def process_doc_file(filepath):
    """Run both extraction strategies on a single .doc file.

    Prints a header naming the file followed by a 50-character ``=`` rule,
    then calls ``extract_text_from_ole`` and ``extract_text_simple`` in that
    order.

    Args:
        filepath: Path of the .doc file to process.

    Returns:
        A dict with two string entries: ``'ole'`` (result of the OLE-based
        extraction) and ``'simple'`` (result of the raw byte scan).
    """
    separator = "=" * 50
    print(f"\n处理文件: {filepath}")
    print(separator)

    extracted = {}
    # Strategy 1: parse the OLE container.
    extracted['ole'] = extract_text_from_ole(filepath)
    # Strategy 2: scan the raw bytes for readable runs.
    extracted['simple'] = extract_text_simple(filepath)
    return extracted
|
||||
|
||||
|
||||
def process_directory(directory, output_file=None):
    """Process every .doc file found under ``directory`` (recursively).

    Files are matched case-insensitively on the ``.doc`` suffix, so ``.docx``
    files are not picked up. Each match is passed to ``process_doc_file``.

    Args:
        directory: Root directory to walk.
        output_file: Accepted for interface compatibility; not used by this
            function.

    Returns:
        A dict mapping each file to its result. The key is the bare filename;
        if that name is already present (same filename in another
        subdirectory), the path relative to ``directory`` is used instead so
        that no result is overwritten. The value is the dict returned by
        ``process_doc_file``, or ``{'error': <message>}`` if it raised.
    """
    results = {}

    for root, _dirs, files in os.walk(directory):
        for file in files:
            # '.docx' never ends with '.doc', so one suffix test is enough.
            if not file.lower().endswith('.doc'):
                continue

            filepath = os.path.join(root, file)

            # Keep the bare filename as key where possible; disambiguate
            # same-named files from other subdirectories by relative path.
            key = file
            if key in results:
                key = os.path.relpath(filepath, directory)

            try:
                results[key] = process_doc_file(filepath)
            except Exception as e:
                results[key] = {'error': str(e)}

    return results
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Expects one argument, a file or directory path. For a file, prints up to
    the first 2000 characters of each extraction result. For a directory,
    prints the number of processed .doc files and, per file, either its
    error message or a 200-character preview of the simple extraction.
    Exits with status 1 when the argument is missing or the path is neither
    a file nor a directory.
    """
    argv = sys.argv
    if len(argv) < 2:
        print("用法: python extract_doc.py <文件或目录路径>")
        print("示例: python extract_doc.py 参考文档\\15.XXX医院绩效方案.doc")
        sys.exit(1)

    path = argv[1]

    if os.path.isfile(path):
        outcome = process_doc_file(path)
        sections = (
            ("\n--- OLE解析结果 ---", 'ole'),
            ("\n--- 简单提取结果 ---", 'simple'),
        )
        for heading, key in sections:
            print(heading)
            # Slicing already returns the whole string when it is shorter.
            print(outcome[key][:2000])
        return

    if os.path.isdir(path):
        results = process_directory(path)
        print(f"\n处理了 {len(results)} 个.doc文件")

        for filename, entry in results.items():
            print(f"\n{filename}:")
            if 'error' in entry:
                print(f" 错误: {entry['error']}")
                continue
            if entry['simple']:
                preview = entry['simple'][:200]
            else:
                preview = "无内容"
            print(f" 预览: {preview}...")
        return

    print(f"路径不存在: {path}")
    sys.exit(1)
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user