#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Extract text from legacy .doc (Word 97-2003) files.

Two best-effort strategies are provided:

* OLE extraction: open the compound file with ``olefile`` and scan the
  ``WordDocument`` and table streams for printable text.
* Simple extraction: scan the raw file bytes; needs no third-party library.

Neither strategy walks the document's piece table, so the output is a
heuristic approximation of the text, not an exact rendering.
"""

import os
import re
import struct
import sys
from pathlib import Path

try:
    import olefile
except ImportError:
    # Only the OLE strategy needs olefile; the simple strategy must keep
    # working without it, so the failure is reported at call time instead.
    olefile = None

# Matches every character that is NOT a CJK ideograph, printable ASCII or
# basic whitespace; used to strip binary noise from a decoded stream.
_NON_TEXT_RE = re.compile(r'[^\u4e00-\u9fff\u0020-\u007e\n\r\t]')

# Matches runs of CJK ideographs, printable ASCII, basic whitespace and common
# CJK / full-width punctuation (。、（），；：！？).  The full-width forms are
# spelled as escapes so they cannot be confused with their ASCII twins.
_TEXT_RUN_RE = re.compile(
    r'[\u4e00-\u9fff\u0020-\u007e\n\r\t'
    r'\u3002\u3001\uff08\uff09\uff0c\uff1b\uff1a\uff01\uff1f]+'
)

# The FIB flags word sits at offset 0x0A of the WordDocument stream; bit 9
# (fWhichTblStm) names the table stream the FIB refers to (1 -> "1Table").
_FIB_FLAGS_OFFSET = 0x0A
_FIB_WHICH_TABLE_STREAM = 0x0200


def extract_text_from_ole(filepath):
    """Extract text from the OLE streams of a .doc file.

    Args:
        filepath: Path of the .doc file to open.

    Returns:
        The extracted text, or a human-readable error message (prefixed with
        ``错误: ``) when the file cannot be opened or parsed, or
        ``不是有效的Word文档`` when it has no ``WordDocument`` stream.
    """
    if olefile is None:
        return "错误: 未安装 olefile 库 (pip install olefile)"

    try:
        ole = olefile.OleFileIO(filepath)
    except Exception as e:
        # olefile raises several exception types for unreadable or non-OLE
        # input; callers of this function expect a message, not a raise.
        return f"错误: {e}"

    try:
        if not ole.exists('WordDocument'):
            return "不是有效的Word文档"
        data = ole.openstream('WordDocument').read()
        return extract_text_from_fib(data, ole)
    except Exception as e:
        return f"错误: {e}"
    finally:
        # Always release the underlying file handle, even on early return.
        ole.close()


def _pick_table_stream(data, ole):
    """Return the name of the table stream to read, or None if absent.

    The stream selected by the FIB's fWhichTblStm flag is preferred; if the
    header is too short to hold the flag, or the flagged stream is missing,
    fall back to whichever table stream exists (``1Table`` first).
    """
    candidates = ['1Table', '0Table']
    if len(data) >= _FIB_FLAGS_OFFSET + 2:
        (flags,) = struct.unpack_from('<H', data, _FIB_FLAGS_OFFSET)
        flagged = '1Table' if flags & _FIB_WHICH_TABLE_STREAM else '0Table'
        candidates.insert(0, flagged)
    return next((name for name in candidates if ole.exists(name)), None)


def _printable_utf16(raw):
    """Decode *raw* as UTF-16 LE and keep only CJK/ASCII/whitespace chars."""
    # errors='ignore' means decoding never raises on malformed input.
    return _NON_TEXT_RE.sub('', raw.decode('utf-16-le', errors='ignore'))


def extract_text_from_fib(data, ole):
    """Heuristically extract text from the WordDocument and table streams.

    This does not parse the piece table; both streams are decoded as
    UTF-16 LE and everything except CJK ideographs, printable ASCII and
    basic whitespace is discarded.

    Args:
        data: Raw bytes of the ``WordDocument`` stream (starts with the FIB).
        ole: An open OLE container exposing ``exists(name)`` and
            ``openstream(name)``.

    Returns:
        The text found in the WordDocument stream followed by the text found
        in the table stream, joined by a newline; ``无法提取文本内容`` if
        neither yields text; ``无法找到表格流`` if no table stream exists;
        or ``解析错误: ...`` if reading the table stream fails.
    """
    try:
        table_name = _pick_table_stream(data, ole)
        if table_name is None:
            return "无法找到表格流"
        table_data = ole.openstream(table_name).read()

        text_parts = [
            text
            for text in (_printable_utf16(data), _printable_utf16(table_data))
            if text.strip()
        ]
        return '\n'.join(text_parts) if text_parts else "无法提取文本内容"
    except Exception as e:
        # Best-effort boundary: report the problem as text, as callers expect.
        return f"解析错误: {e}"


def extract_text_simple(filepath):
    """Extract readable text runs from the raw bytes of a file.

    The bytes are decoded twice (UTF-16 LE, then GBK); from each decoding,
    runs of CJK/ASCII text whose stripped length exceeds one character are
    kept.  Duplicate runs are dropped, preserving first-seen order.

    Args:
        filepath: Path of the file to scan.

    Returns:
        The runs joined by newlines, ``无法提取文本`` if none were found, or
        ``错误: ...`` if the file could not be read.
    """
    try:
        with open(filepath, 'rb') as f:
            data = f.read()
    except OSError as e:
        return f"错误: {e}"

    runs = []
    for encoding in ('utf-16-le', 'gbk'):
        decoded = data.decode(encoding, errors='ignore')
        runs.extend(
            run for run in _TEXT_RUN_RE.findall(decoded)
            if len(run.strip()) > 1
        )

    # dict.fromkeys removes duplicates while keeping insertion order.
    unique_parts = list(dict.fromkeys(runs))
    return '\n'.join(unique_parts) if unique_parts else "无法提取文本"


def process_doc_file(filepath):
    """Run both extraction strategies on one .doc file.

    Prints a short header for the file, then returns a dict with the OLE
    result under ``'ole'`` and the simple result under ``'simple'``.
    """
    print(f"\n处理文件: {filepath}")
    print("=" * 50)

    return {
        'ole': extract_text_from_ole(filepath),
        'simple': extract_text_simple(filepath),
    }


def process_directory(directory, output_file=None):
    """Process every .doc file found under *directory* (recursively).

    Args:
        directory: Root directory to walk.
        output_file: Currently unused; accepted for interface compatibility.

    Returns:
        A dict mapping each file to its ``process_doc_file`` result, or to
        ``{'error': message}`` if processing raised.  Keys are bare file
        names; when a name was already used by a file in another directory,
        the later file is keyed by its path relative to *directory* so that
        no result is silently overwritten.
    """
    results = {}

    for root, _dirs, files in os.walk(directory):
        for file in files:
            # A name ending in ".doc" can never end in ".docx", so a single
            # suffix check is enough to exclude the newer format.
            if not file.lower().endswith('.doc'):
                continue
            filepath = os.path.join(root, file)
            # os.walk is top-down, so a relative path (which contains a
            # separator) can never clash with an earlier bare-name key.
            key = file if file not in results else os.path.relpath(
                filepath, directory)
            try:
                results[key] = process_doc_file(filepath)
            except Exception as e:
                results[key] = {'error': str(e)}

    return results


def main():
    """Command-line entry point: extract text from a file or a directory."""
    if len(sys.argv) < 2:
        print("用法: python extract_doc.py <文件或目录路径>")
        print("示例: python extract_doc.py 参考文档\\15.XXX医院绩效方案.doc")
        sys.exit(1)

    path = sys.argv[1]

    if os.path.isfile(path):
        result = process_doc_file(path)
        # Slicing already copes with short strings; show at most 2000 chars.
        print("\n--- OLE解析结果 ---")
        print(result['ole'][:2000])
        print("\n--- 简单提取结果 ---")
        print(result['simple'][:2000])
    elif os.path.isdir(path):
        results = process_directory(path)
        print(f"\n处理了 {len(results)} 个.doc文件")
        for filename, result in results.items():
            print(f"\n{filename}:")
            if 'error' in result:
                print(f"  错误: {result['error']}")
            else:
                preview = result['simple'][:200] if result['simple'] else "无内容"
                print(f"  预览: {preview}...")
    else:
        print(f"路径不存在: {path}")
        sys.exit(1)


if __name__ == '__main__':
    main()