#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
批量提取参考文档目录中所有 .doc 文件的内容
输出为 JSON 格式,方便后续处理
"""
|
||
import os
|
||
import sys
|
||
import json
|
||
import olefile
|
||
import re
|
||
from pathlib import Path
|
||
from datetime import datetime
|
||
|
||
|
||
def extract_text_from_doc(filepath):
    """Extract readable text from a legacy .doc (OLE compound) file.

    Text is pulled heuristically: the WordDocument stream, the companion
    table stream, and finally the raw file bytes are each decoded as
    UTF-16-LE and scanned for runs of plausible characters.

    Args:
        filepath: Path to the .doc file.

    Returns:
        Tuple ``(text, error)``: ``text`` is the newline-joined extracted
        text (None on failure); ``error`` is an error message string
        (None on success).
    """
    # Characters worth keeping: CJK ideographs, printable ASCII, common
    # CJK punctuation, brackets, and math symbols.  Compiled once instead
    # of being rebuilt for each of the three scans (was copy-pasted 3x).
    pattern = re.compile(r'[\u4e00-\u9fff\u0020-\u007e\n\r\t\.,;:!?()()。,;:!?、""''【】\[\]{}{}<>《》\-+×÷=±≡≈≠≤≥∞∩∪∈∉⊂⊃⊆⊇∠⊥∥→←↑↓↔\d]+')

    def _collect(blob, parts):
        """Decode *blob* as UTF-16-LE and append plausible text runs to *parts*."""
        # errors='ignore' makes decode effectively non-raising; keep a
        # narrow guard instead of the original bare except.
        try:
            decoded = blob.decode('utf-16-le', errors='ignore')
        except (UnicodeDecodeError, AttributeError):
            return
        for match in pattern.findall(decoded):
            cleaned = match.strip()
            if len(cleaned) > 1:
                parts.append(cleaned)

    ole = None
    try:
        ole = olefile.OleFileIO(filepath)

        if not ole.exists('WordDocument'):
            return None, "不是有效的Word文档"

        # Main document stream.
        data = ole.openstream('WordDocument').read()

        # Companion table stream ('1Table' preferred, else '0Table').
        table_name = '1Table' if ole.exists('1Table') else '0Table'
        table_data = b''
        if ole.exists(table_name):
            table_data = ole.openstream(table_name).read()

        text_parts = []
        _collect(data, text_parts)
        _collect(table_data, text_parts)

        # Fallback: brute-force scan of the whole file.
        with open(filepath, 'rb') as f:
            _collect(f.read(), text_parts)

        # De-duplicate while preserving first-seen order.
        seen = set()
        unique_parts = []
        for part in text_parts:
            if part not in seen and len(part) > 1:
                seen.add(part)
                unique_parts.append(part)

        return '\n'.join(unique_parts), None

    except Exception as e:
        return None, str(e)
    finally:
        # BUG FIX: the original leaked the OLE handle on the early-return
        # ("not a Word document") and exception paths.
        if ole is not None:
            ole.close()
|
||
|
||
|
||
def extract_tables_from_doc(filepath):
    """Heuristically extract table-like rows from a legacy .doc file.

    Proper .doc table extraction requires decoding the binary piece
    table; this simplified approach decodes the raw bytes as UTF-16-LE
    and keeps short, non-separator lines as candidate table rows.

    Args:
        filepath: Path to the .doc file.

    Returns:
        List of candidate table-row strings; empty list on any failure.
    """
    # Rows consisting solely of whitespace, dashes, or pipes are treated
    # as visual separators, not content.
    separator = re.compile(r'^[\s\-\|]+$')

    ole = None
    try:
        # Opening the OLE container validates that this is a compound
        # file; the row scan itself runs over the raw bytes below.
        ole = olefile.OleFileIO(filepath)

        with open(filepath, 'rb') as f:
            data = f.read()

        decoded = data.decode('utf-16-le', errors='ignore')
        lines = [l.strip() for l in decoded.split('\n') if l.strip()]

        table_rows = []
        for line in lines:
            # Table cells are typically short text; skip separators.
            if 2 < len(line) < 200 and not separator.match(line):
                table_rows.append(line)

        return table_rows

    except Exception:
        # Best-effort helper: callers treat an empty list as "no tables".
        return []
    finally:
        # BUG FIX: the original leaked the OLE handle when an exception
        # occurred before ole.close().  Also removed the unused `tables`
        # list the original allocated and never returned.
        if ole is not None:
            ole.close()
|
||
|
||
|
||
def process_all_doc_files(directory):
    """Extract text and table rows from every .doc file under *directory*.

    Walks the directory tree recursively, running both extractors on
    each legacy .doc file found.

    Args:
        directory: Root directory to scan.

    Returns:
        Dict mapping filename -> {'filepath', 'text', 'tables', 'error'}.
    """
    # Collect (filename, full path) pairs.  NOTE: a name ending in
    # '.docx' can never satisfy endswith('.doc'), so the original's
    # extra "and not .endswith('.docx')" check was redundant.
    doc_files = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.doc'):
                doc_files.append((file, os.path.join(root, file)))

    print(f"发现 {len(doc_files)} 个 .doc 文件")

    results = {}
    for i, (filename, filepath) in enumerate(doc_files, start=1):
        # BUG FIX: the progress line printed a literal "(unknown)"
        # instead of the file currently being processed.
        print(f"处理 [{i}/{len(doc_files)}]: {filename}")

        text, error = extract_text_from_doc(filepath)
        tables = extract_tables_from_doc(filepath)

        results[filename] = {
            'filepath': filepath,
            'text': text if text else '',
            'tables': tables,
            'error': error,
        }

    return results
|
||
|
||
|
||
def main():
    """CLI entry point: batch-extract all .doc files and save JSON results.

    Usage: script.py [doc_dir] [output_file]
    Both arguments are optional and default to the project paths below.
    """
    # Input directory: overridable via the first CLI argument.
    doc_dir = r"D:\医院绩效系统\参考文档"
    if len(sys.argv) > 1:
        doc_dir = sys.argv[1]

    # Output path: generalized to accept an optional second CLI argument
    # (backward compatible — the old hard-coded path is still the default).
    output_file = r"D:\医院绩效系统\doc_extracted_content.json"
    if len(sys.argv) > 2:
        output_file = sys.argv[2]

    print(f"开始处理目录: {doc_dir}")
    print("=" * 60)

    # Extract every .doc file under the directory.
    results = process_all_doc_files(doc_dir)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\n结果已保存到: {output_file}")

    # Summary statistics.
    success_count = sum(1 for v in results.values() if v.get('text'))
    error_count = sum(1 for v in results.values() if v.get('error'))
    print(f"成功提取: {success_count} 个文件")
    print(f"提取失败: {error_count} 个文件")

    # Preview the first three results.
    print("\n" + "=" * 60)
    print("部分提取结果预览:")
    print("=" * 60)

    for filename, data in list(results.items())[:3]:
        # BUG FIX: the preview header printed a literal "(unknown)"
        # instead of the filename.
        print(f"\n【{filename}】")
        text = data.get('text', '')
        if text:
            print(text[:500] + "..." if len(text) > 500 else text)
        else:
            print(f" 错误: {data.get('error', '未知错误')}")
|
||
|
||
|
||
# Run the batch extraction only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
|