提交文件

2026-02-28 15:16:15 +08:00
parent 1a4e50e0a4
commit 44f250f58e
159 changed files with 61268 additions and 0 deletions
--- a/extract_doc.py
+++ b/extract_doc.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
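+"""
+Extract text content from .doc (Word 97-2003) files.
+
+Two best-effort approaches are provided: decoding the OLE streams of the
+document, and a brute-force scan of the raw file bytes for readable text.
+"""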
+
+import os
+import sys
+import olefile
+import re
+from pathlib import Path
+
+
+def extract_text_from_ole(filepath):
+    """Extract text from the OLE streams of a Word 97-2003 document.
+
+    Opens *filepath* as an OLE container, reads the ``WordDocument`` stream
+    and hands it to ``extract_text_from_fib`` together with the open
+    container.
+
+    Args:
+        filepath: Path of the .doc file to read.
+
+    Returns:
+        The extracted text, or a human-readable message when the file has no
+        ``WordDocument`` stream or any error occurs. This function never
+        raises; failures are reported through the returned string.
+    """
+    ole = None
+    try:
+        ole = olefile.OleFileIO(filepath)
+        # A Word binary document always stores its main stream under this name.
+        if not ole.exists('WordDocument'):
+            return "不是有效的Word文档"
+
+        data = ole.openstream('WordDocument').read()
+
+        # Simplified extraction: the real .doc layout is far more complex.
+        return extract_text_from_fib(data, ole)
+    except Exception as e:
+        return f"错误: {str(e)}"
+    finally:
+        # Release the underlying file handle on every path, including the
+        # early return and error cases.
+        if ole is not None:
+            ole.close()
+
+
+def _printable_utf16_text(raw):
+    """Decode *raw* as UTF-16-LE and drop everything except CJK ideographs
+    (U+4E00-U+9FFF), printable ASCII, and newline/carriage-return/tab."""
+    decoded = raw.decode('utf-16-le', errors='ignore')
+    return re.sub(r'[^\u4e00-\u9fff\u0020-\u007e\n\r\t]', '', decoded)
+
+
+def extract_text_from_fib(data, ole):
+    """Heuristically pull readable text out of a Word document's streams.
+
+    Despite the name, this does not parse the FIB or the piece table. It
+    decodes the whole ``WordDocument`` stream and the whole table stream as
+    UTF-16-LE and keeps only the characters that look like text.
+
+    Args:
+        data: Raw bytes of the ``WordDocument`` stream.
+        ole: Open OLE container providing ``exists(name)`` and
+            ``openstream(name)``; used to locate the ``1Table``/``0Table``
+            stream.
+
+    Returns:
+        The filtered text of both streams joined by a newline (document
+        stream first), or a message string when no table stream exists,
+        nothing readable is found, or an error occurs.
+    """
+    try:
+        # Prefer '1Table'; fall back to '0Table' when it is absent.
+        table_name = '1Table' if ole.exists('1Table') else '0Table'
+        if not ole.exists(table_name):
+            return "无法找到表格流"
+
+        table_data = ole.openstream(table_name).read()
+
+        # Decoding uses errors='ignore', so it cannot fail on malformed
+        # bytes; anything unexpected is reported by the handler below
+        # instead of being silently swallowed.
+        text_parts = []
+        for raw in (data, table_data):
+            printable = _printable_utf16_text(raw)
+            if printable.strip():
+                text_parts.append(printable)
+
+        return '\n'.join(text_parts) if text_parts else "无法提取文本内容"
+
+    except Exception as e:
+        return f"解析错误: {str(e)}"
+
+
+def extract_text_simple(filepath):
+    """Brute-force extraction of readable text from a file's raw bytes.
+
+    The whole file is decoded twice, first as UTF-16-LE and then as GBK,
+    ignoring undecodable bytes. From each decoding, runs of CJK ideographs
+    (U+4E00-U+9FFF), printable ASCII, whitespace and common Chinese
+    punctuation are collected. Runs whose stripped length is 1 or less are
+    discarded, and duplicates are removed while keeping first-seen order.
+
+    Args:
+        filepath: Path of the file to scan.
+
+    Returns:
+        The surviving runs joined by newlines, or a message string when
+        nothing is found or the file cannot be read. Never raises.
+    """
+    try:
+        with open(filepath, 'rb') as f:
+            data = f.read()
+
+        text_run = re.compile(
+            r'[\u4e00-\u9fff\u0020-\u007e\n\r\t\.,;:!?()（）。，；：！？、]+'
+        )
+
+        # UTF-16-LE is tried first, then GBK; errors='ignore' means decoding
+        # itself cannot fail, so no per-encoding exception handling is needed.
+        text_parts = []
+        for encoding in ('utf-16-le', 'gbk'):
+            decoded = data.decode(encoding, errors='ignore')
+            text_parts.extend(
+                run for run in text_run.findall(decoded) if len(run.strip()) > 1
+            )
+
+        # dict.fromkeys removes duplicates while preserving insertion order.
+        unique_parts = list(dict.fromkeys(text_parts))
+
+        return '\n'.join(unique_parts) if unique_parts else "无法提取文本"
+
+    except Exception as e:
+        return f"错误: {str(e)}"
+
+
+def process_doc_file(filepath):
+    """Run both extraction strategies on one .doc file.
+
+    Prints a short banner for the file, then returns a dict whose ``'ole'``
+    entry holds the result of ``extract_text_from_ole`` and whose
+    ``'simple'`` entry holds the result of ``extract_text_simple``.
+    """
+    separator = "=" * 50
+    print(f"\n处理文件: {filepath}")
+    print(separator)
+
+    extracted = {}
+    # Strategy 1: read the OLE streams.
+    extracted['ole'] = extract_text_from_ole(filepath)
+    # Strategy 2: scan the raw bytes.
+    extracted['simple'] = extract_text_simple(filepath)
+    return extracted
+
+
+def process_directory(directory, output_file=None):
+    """Process every .doc file found under *directory*, recursively.
+
+    Args:
+        directory: Root directory to walk.
+        output_file: Currently unused; kept for interface compatibility.
+
+    Returns:
+        A dict mapping each file's path relative to *directory* to the dict
+        returned by ``process_doc_file``, or to ``{'error': message}`` when
+        processing that file raised. Files directly inside *directory* are
+        keyed by their plain filename; using the relative path keeps
+        same-named files in different subdirectories from overwriting each
+        other.
+    """
+    results = {}
+
+    for root, _dirs, files in os.walk(directory):
+        for file in files:
+            # A name ending in '.docx' never ends in '.doc', so this single
+            # case-insensitive check already excludes .docx files.
+            if not file.lower().endswith('.doc'):
+                continue
+
+            filepath = os.path.join(root, file)
+            key = os.path.relpath(filepath, directory)
+            try:
+                results[key] = process_doc_file(filepath)
+            except Exception as e:
+                # One bad file must not abort the whole directory scan.
+                results[key] = {'error': str(e)}
+
+    return results
+
+
+def main():
+    """Command-line entry point.
+
+    Expects one argument: a file or a directory. For a file, prints up to
+    the first 2000 characters of each extraction result. For a directory,
+    prints a per-file error or a 200-character preview of the simple
+    extraction. Exits with status 1 when the argument is missing or the
+    path does not exist.
+    """
+    if len(sys.argv) < 2:
+        print("用法: python extract_doc.py <文件或目录路径>")
+        print("示例: python extract_doc.py 参考文档\\15.XXX医院绩效方案.doc")
+        sys.exit(1)
+
+    path = sys.argv[1]
+
+    if os.path.isfile(path):
+        result = process_doc_file(path)
+        sections = (
+            ("\n--- OLE解析结果 ---", 'ole'),
+            ("\n--- 简单提取结果 ---", 'simple'),
+        )
+        for heading, key in sections:
+            print(heading)
+            # Slicing already returns the whole string when it is shorter.
+            print(result[key][:2000])
+        return
+
+    if os.path.isdir(path):
+        results = process_directory(path)
+        print(f"\n处理了 {len(results)} 个.doc文件")
+
+        for filename, result in results.items():
+            print(f"\n{filename}:")
+            if 'error' in result:
+                print(f"  错误: {result['error']}")
+                continue
+            # An empty extraction falls back to the placeholder text.
+            preview = result['simple'][:200] or "无内容"
+            print(f"  预览: {preview}...")
+        return
+
+    print(f"路径不存在: {path}")
+    sys.exit(1)
+
+
+# Run the command-line interface only when this file is executed as a script.
+if __name__ == '__main__':
+    main()