# hospital_performance/extract_doc.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
提取 .doc (Word 97-2003) 文件内容的脚本
支持多种方法：OLE流提取、结构化解析
"""

import os
import sys
import olefile
import re
from pathlib import Path


def extract_text_from_ole(filepath):
    """Extract text from a .doc file by opening it as an OLE compound file.

    Args:
        filepath: Path of the file to open with ``olefile.OleFileIO``.

    Returns:
        The text produced by ``extract_text_from_fib`` for the
        ``WordDocument`` stream, the message "不是有效的Word文档" when the
        container has no such stream, or a string starting with "错误: "
        when anything raises. This function never raises ``Exception``.
    """
    try:
        ole = olefile.OleFileIO(filepath)
        try:
            if not ole.exists('WordDocument'):
                return "不是有效的Word文档"

            # Read the whole main document stream; the table stream is
            # looked up by extract_text_from_fib through the same handle.
            data = ole.openstream('WordDocument').read()
            return extract_text_from_fib(data, ole)
        finally:
            # Always release the underlying file handle, including on the
            # early return and when extraction raises.
            ole.close()
    except Exception as e:
        return f"错误: {str(e)}"


def _printable_utf16(raw):
    """Decode *raw* as UTF-16-LE and drop everything except CJK ideographs
    (U+4E00-U+9FFF), printable ASCII, and newline / carriage return / tab."""
    decoded = raw.decode('utf-16-le', errors='ignore')
    return re.sub(r'[^\u4e00-\u9fff\u0020-\u007e\n\r\t]', '', decoded)


def extract_text_from_fib(data, ole):
    """Heuristically pull readable text out of the Word streams.

    Despite the name, this does not parse the File Information Block. It
    decodes the raw ``WordDocument`` bytes and the raw table stream as
    UTF-16-LE and keeps only the characters that look like text, so the
    result may contain noise.

    Args:
        data: Raw bytes of the ``WordDocument`` stream.
        ole: Open OLE container offering ``exists(name)`` and
            ``openstream(name)``; used to locate ``1Table`` / ``0Table``.

    Returns:
        The filtered text of both streams joined by a newline (document
        stream first), "无法提取文本内容" when neither stream yields any
        non-blank text, "无法找到表格流" when no table stream exists, or a
        string starting with "解析错误: " when an unexpected error occurs.
    """
    try:
        # Prefer 1Table and fall back to 0Table when it is absent.
        table_name = '1Table' if ole.exists('1Table') else '0Table'
        if not ole.exists(table_name):
            return "无法找到表格流"

        table_data = ole.openstream(table_name).read()

        text_parts = []
        for raw in (data, table_data):
            try:
                printable = _printable_utf16(raw)
            except (UnicodeError, AttributeError, TypeError):
                # Best effort: a stream that cannot be decoded is skipped
                # rather than aborting the whole extraction.
                continue
            if printable.strip():
                text_parts.append(printable)

        return '\n'.join(text_parts) if text_parts else "无法提取文本内容"

    except Exception as e:
        return f"解析错误: {str(e)}"


def extract_text_simple(filepath):
    """Scan the raw bytes of a file for runs of readable text.

    The whole file is decoded twice, as UTF-16-LE and as GBK (undecodable
    bytes are ignored), and every run of CJK ideographs, printable ASCII,
    whitespace and common Chinese punctuation is collected. Runs whose
    stripped length is 1 or less are discarded, and duplicates are removed
    while keeping first-seen order.

    Args:
        filepath: Path of the file to read in binary mode.

    Returns:
        The surviving runs joined by newlines, "无法提取文本" when nothing
        survives, or a string starting with "错误: " when reading or
        processing fails. This function never raises ``Exception``.
    """
    text_run = re.compile(
        r'[\u4e00-\u9fff\u0020-\u007e\n\r\t\.,;:!?()（）。，；：！？、]+'
    )

    try:
        with open(filepath, 'rb') as f:
            data = f.read()

        text_parts = []
        # UTF-16-LE first, then GBK; the order fixes the order of the output.
        for encoding in ('utf-16-le', 'gbk'):
            try:
                decoded = data.decode(encoding, errors='ignore')
            except (UnicodeError, LookupError):
                # Best effort: skip an encoding that fails or is unavailable.
                continue
            text_parts.extend(
                run for run in text_run.findall(decoded)
                if len(run.strip()) > 1
            )

        # dict.fromkeys de-duplicates while preserving insertion order.
        unique_parts = list(dict.fromkeys(text_parts))
        return '\n'.join(unique_parts) if unique_parts else "无法提取文本"

    except Exception as e:
        return f"错误: {str(e)}"


def process_doc_file(filepath):
    """Run both extraction strategies on a single .doc file.

    Prints a short banner naming the file, then calls the OLE-based
    extractor followed by the raw-byte extractor.

    Args:
        filepath: Path of the .doc file to process.

    Returns:
        A dict with the key 'ole' (result of ``extract_text_from_ole``) and
        the key 'simple' (result of ``extract_text_simple``).
    """
    print(f"\n处理文件: {filepath}")
    print("=" * 50)

    extracted = {}
    # Strategy 1: read the streams of the OLE container.
    extracted['ole'] = extract_text_from_ole(filepath)
    # Strategy 2: scan the raw file bytes.
    extracted['simple'] = extract_text_simple(filepath)
    return extracted


def process_directory(directory, output_file=None):
    """Process every .doc file found under *directory*, recursively.

    Results are keyed by each file's path relative to *directory*. For files
    directly inside *directory* that is just the file name; files in
    subdirectories keep their relative path, so two files with the same name
    in different folders no longer overwrite each other.

    Args:
        directory: Root directory to walk.
        output_file: Accepted for interface compatibility; this function
            does not use it.

    Returns:
        A dict mapping relative path to the dict returned by
        ``process_doc_file``, or to ``{'error': message}`` when processing
        that file raised.
    """
    results = {}

    for root, _dirs, names in os.walk(directory):
        for name in names:
            # A name ending in ".doc" can never end in ".docx", so this one
            # test already excludes .docx files.
            if not name.lower().endswith('.doc'):
                continue

            filepath = os.path.join(root, name)
            key = os.path.relpath(filepath, directory)
            try:
                results[key] = process_doc_file(filepath)
            except Exception as e:
                results[key] = {'error': str(e)}

    return results


def main():
    """Command-line entry point.

    Expects one argument, a file or directory path. For a file, prints up to
    the first 2000 characters of each extraction result. For a directory,
    prints the number of processed files and, per file, either its error or
    a preview of up to 200 characters of the raw-byte result. Exits with
    status 1 when the argument is missing or the path does not exist.
    """
    argv = sys.argv
    if len(argv) < 2:
        print("用法: python extract_doc.py <文件或目录路径>")
        print("示例: python extract_doc.py 参考文档\\15.XXX医院绩效方案.doc")
        sys.exit(1)

    path = argv[1]

    if os.path.isfile(path):
        result = process_doc_file(path)
        sections = (
            ("\n--- OLE解析结果 ---", 'ole'),
            ("\n--- 简单提取结果 ---", 'simple'),
        )
        for heading, key in sections:
            print(heading)
            # Slicing already caps the output at 2000 characters.
            print(result[key][:2000])
        return

    if not os.path.isdir(path):
        print(f"路径不存在: {path}")
        sys.exit(1)

    results = process_directory(path)
    print(f"\n处理了 {len(results)} 个.doc文件")

    for filename, result in results.items():
        print(f"\n{filename}:")
        if 'error' in result:
            print(f"  错误: {result['error']}")
            continue
        preview = result['simple'][:200] if result['simple'] else "无内容"
        print(f"  预览: {preview}...")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()