提交文件

This commit is contained in:
2026-02-28 15:16:15 +08:00
parent 1a4e50e0a4
commit 44f250f58e
159 changed files with 61268 additions and 0 deletions

198
extract_doc.py Normal file
View File

@@ -0,0 +1,198 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Extract the text content of .doc (Word 97-2003) files.
Supports several methods: OLE stream extraction and structured parsing.
"""
import os
import sys
import olefile
import re
from pathlib import Path
def extract_text_from_ole(filepath):
    """Extract text from a .doc file by opening it as an OLE container.

    Args:
        filepath: Path of the file to open with ``olefile``.

    Returns:
        The string produced by ``extract_text_from_fib`` when the container
        has a ``WordDocument`` stream, or a fixed message when it does not.
        Exceptions are never propagated; they are reported as a string
        prefixed with "错误: ".
    """
    try:
        ole = olefile.OleFileIO(filepath)
    except Exception as e:
        return f"错误: {str(e)}"

    try:
        # A Word binary document always stores its main stream under this name.
        if not ole.exists('WordDocument'):
            return "不是有效的Word文档"
        data = ole.openstream('WordDocument').read()
        # Simplified extraction: the real .doc layout is not parsed here.
        return extract_text_from_fib(data, ole)
    except Exception as e:
        return f"错误: {str(e)}"
    finally:
        # Release the underlying file handle on every path, including errors.
        ole.close()
# Matches every character that is NOT a CJK unified ideograph, printable
# ASCII, newline, carriage return or tab.
_NON_PRINTABLE_RE = re.compile(r'[^\u4e00-\u9fff\u0020-\u007e\n\r\t]')


def _keep_printable(raw):
    """Decode *raw* as UTF-16-LE (ignoring errors) and strip filtered characters."""
    decoded = raw.decode('utf-16-le', errors='ignore')
    return _NON_PRINTABLE_RE.sub('', decoded)


def extract_text_from_fib(data, ole):
    """Heuristically pull readable text out of the Word streams.

    Despite the name, no FIB fields or piece-table structures are parsed.
    The whole ``WordDocument`` payload and the whole table stream are each
    decoded as UTF-16-LE and filtered down to CJK ideographs, printable
    ASCII and basic whitespace.

    Args:
        data: Raw bytes of the ``WordDocument`` stream.
        ole: Open OLE container; must provide ``exists(name)`` and
            ``openstream(name)``.

    Returns:
        The filtered text of both streams joined by a newline (document
        stream first), or a fixed message when no table stream exists or
        nothing readable was found. Unexpected exceptions are reported as a
        string prefixed with "解析错误: ".
    """
    try:
        # '1Table' takes precedence; '0Table' is the fallback.
        table_name = next(
            (name for name in ('1Table', '0Table') if ole.exists(name)),
            None,
        )
        if table_name is None:
            return "无法找到表格流"
        table_data = ole.openstream(table_name).read()

        text_parts = []
        for raw in (data, table_data):
            try:
                printable = _keep_printable(raw)
            except (UnicodeError, AttributeError):
                # Best effort: an undecodable stream must not block the other.
                continue
            if printable.strip():
                text_parts.append(printable)

        return '\n'.join(text_parts) if text_parts else "无法提取文本内容"
    except Exception as e:
        return f"解析错误: {str(e)}"
# Runs of CJK unified ideographs, printable ASCII, newline/CR/tab and the
# listed punctuation marks.
_READABLE_RUN_RE = re.compile(
    r'[\u4e00-\u9fff\u0020-\u007e\n\r\t\.,;:!?()()。,;:!?、]+'
)


def extract_text_simple(filepath):
    """Scan the raw bytes of a file for readable text fragments.

    The file content is decoded twice, first as UTF-16-LE and then as GBK
    (undecodable bytes are ignored). From each decoding, every run of
    accepted characters longer than one character after stripping is kept.
    Duplicate fragments are dropped while preserving first-seen order.

    Args:
        filepath: Path of the file to read in binary mode.

    Returns:
        The unique fragments joined by newlines, or "无法提取文本" when no
        fragment qualifies. Any exception (for example a missing file) is
        reported as a string prefixed with "错误: " instead of being raised.
    """
    try:
        with open(filepath, 'rb') as f:
            data = f.read()

        text_parts = []
        for encoding in ('utf-16-le', 'gbk'):
            try:
                decoded = data.decode(encoding, errors='ignore')
            except (UnicodeError, LookupError):
                # Best effort: skip an encoding that cannot be applied.
                continue
            text_parts.extend(
                run for run in _READABLE_RUN_RE.findall(decoded)
                if len(run.strip()) > 1
            )

        # dict preserves insertion order, giving an ordered de-duplication.
        unique_parts = list(dict.fromkeys(text_parts))
        return '\n'.join(unique_parts) if unique_parts else "无法提取文本"
    except Exception as e:
        return f"错误: {str(e)}"
def process_doc_file(filepath):
    """Run both extraction strategies on a single .doc file.

    Prints a header naming the file followed by a 50-character rule, then
    returns a dict holding the OLE-based result under 'ole' and the raw
    byte-scan result under 'simple'.
    """
    header = f"\n处理文件: {filepath}"
    rule = "=" * 50
    print(header)
    print(rule)

    # Strategy 1 (OLE parsing) runs before strategy 2 (raw scan).
    strategies = (
        ('ole', extract_text_from_ole),
        ('simple', extract_text_simple),
    )
    return {label: extractor(filepath) for label, extractor in strategies}
def process_directory(directory, output_file=None):
    """Process every .doc file found under *directory*, recursively.

    Args:
        directory: Root directory to walk.
        output_file: Accepted for interface compatibility; not used by this
            function.

    Returns:
        A dict mapping each file to the result of ``process_doc_file``, or
        to ``{'error': message}`` when processing raised. The key is the
        bare file name. When that name was already recorded for a file in
        another directory, the path relative to *directory* is used instead
        so that the earlier entry is not overwritten.
    """
    results = {}
    for root, dirs, files in os.walk(directory):
        # Sort in place so traversal order, and therefore which duplicate
        # keeps the bare name, is deterministic.
        dirs.sort()
        for file in sorted(files):
            # A name ending in '.doc' can never end in '.docx', so one check
            # is enough to exclude the newer format.
            if not file.lower().endswith('.doc'):
                continue
            filepath = os.path.join(root, file)
            key = file
            if key in results:
                key = os.path.relpath(filepath, directory)
            try:
                results[key] = process_doc_file(filepath)
            except Exception as e:
                results[key] = {'error': str(e)}
    return results
def main():
    """Command-line entry point.

    Expects one argument, a file or directory path. For a file, prints up
    to the first 2000 characters of each extraction result. For a
    directory, prints the number of processed .doc files and a preview of
    up to 200 characters per file (or the recorded error). Prints a usage
    message and exits with status 1 when the argument is missing, and exits
    with status 1 when the path is neither a file nor a directory.
    """
    if len(sys.argv) < 2:
        print("用法: python extract_doc.py <文件或目录路径>")
        print("示例: python extract_doc.py 参考文档\\15.XXX医院绩效方案.doc")
        sys.exit(1)

    path = sys.argv[1]
    if os.path.isfile(path):
        result = process_doc_file(path)
        # Slicing already handles strings shorter than the limit.
        print("\n--- OLE解析结果 ---")
        print(result['ole'][:2000])
        print("\n--- 简单提取结果 ---")
        print(result['simple'][:2000])
    elif os.path.isdir(path):
        results = process_directory(path)
        print(f"\n处理了 {len(results)} 个.doc文件")
        for filename, result in results.items():
            print(f"\n{filename}:")
            if 'error' in result:
                print(f"  错误: {result['error']}")
            else:
                # Empty extraction output falls back to the placeholder.
                preview = result['simple'][:200] or "无内容"
                print(f"  预览: {preview}...")
    else:
        print(f"路径不存在: {path}")
        sys.exit(1)


if __name__ == '__main__':
    main()