# hospital_performance/analyze_docx.py

import os
import sys
import docx
import json

# Force UTF-8 console output so the Chinese file names and document text
# printed below are not mangled by the platform's default code page.
# `reconfigure` only exists on io.TextIOWrapper; when stdout has been replaced
# by another stream object (or is None), skip the call instead of crashing.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8')

# Directory that is scanned for the reference .docx documents.
base_dir = r"D:\医院绩效系统\参考文档"

def read_docx(filepath):
    """Load a .docx file and collect its non-empty text content.

    Returns a dict with two keys:
      'paragraphs' - list of paragraph strings, whitespace-stripped, with
                     empty paragraphs dropped.
      'tables'     - list of tables; each table is a list of rows and each
                     row a list of stripped cell strings. Rows whose cells
                     are all empty, and tables left with no rows, are dropped.

    If anything raises while opening or walking the document, the exception
    is not propagated; ``{'error': <message>}`` is returned instead.
    """
    try:
        document = docx.Document(filepath)

        # Strip every paragraph once, then keep only the non-blank ones.
        stripped_texts = (p.text.strip() for p in document.paragraphs)
        paragraphs = [text for text in stripped_texts if text]

        tables = []
        for tbl in document.tables:
            all_rows = [[c.text.strip() for c in r.cells] for r in tbl.rows]
            # A row is kept when at least one of its cells has text.
            kept_rows = [cells for cells in all_rows if any(cells)]
            if kept_rows:
                tables.append(kept_rows)

        return {'paragraphs': paragraphs, 'tables': tables}
    except Exception as exc:
        return {'error': str(exc)}

# Collect the .docx files in base_dir, sorted once for deterministic output.
# - The extension test is case-insensitive so names like "X.DOCX" are found.
# - Names starting with "~$" are the temporary owner/lock files Word creates
#   next to a document while it is open; they are not real documents and
#   cannot be parsed, so they are skipped.
docx_files = sorted(
    f for f in os.listdir(base_dir)
    if f.lower().endswith('.docx') and not f.startswith('~$')
)
print(f"Found {len(docx_files)} .docx files\n")

# Read every file and print a short per-file summary.
all_content = {}
for filename in docx_files:
    filepath = os.path.join(base_dir, filename)
    print(f"Reading: {filename}")
    content = read_docx(filepath)
    all_content[filename] = content

    # read_docx signals failure with {'error': message}. Show that message
    # instead of a misleading "0 paragraphs / 0 tables" summary. The error
    # entry is still kept in all_content so it ends up in the JSON dump.
    if 'error' in content:
        print(f"  ERROR: {content['error']}")
        continue

    print(f"  Paragraphs: {len(content.get('paragraphs', []))}")
    print(f"  Tables: {len(content.get('tables', []))}")
    for i, table in enumerate(content.get('tables', [])):
        # Column count is taken from the first row of the table.
        print(f"    Table {i+1}: {len(table)} rows x {len(table[0]) if table else 0} cols")

# Save everything (including error entries) to JSON; ensure_ascii=False keeps
# the Chinese text human-readable in the output file.
with open(r"D:\医院绩效系统\docx_content.json", "w", encoding="utf-8") as f:
    json.dump(all_content, f, ensure_ascii=False, indent=2)

print("\nSaved content to docx_content.json")

# Print detailed content for the key assessment files, i.e. those whose name
# contains one of the keywords below.
key_files = [f for f in docx_files if any(k in f for k in ['考核', '评分', '职能'])]
print("\n\n=== DETAILED CONTENT FOR KEY ASSESSMENT FILES ===\n")

for filename in sorted(key_files):
    content = all_content.get(filename, {})
    print(f"\n{'='*80}")
    print(f"FILE: {filename}")
    print(f"{'='*80}")

    # A file that failed to load has no content; report why and move on.
    if 'error' in content:
        print(f"\n(could not be read: {content['error']})")
        continue

    # Only the first 20 paragraphs are shown to keep the output manageable.
    if content.get('paragraphs'):
        print("\n--- Paragraphs ---")
        for p in content['paragraphs'][:20]:
            print(p)

    # Tables are printed in full, one row per line with " | " between cells.
    if content.get('tables'):
        print("\n--- Tables ---")
        for i, table in enumerate(content['tables']):
            print(f"\nTable {i+1}:")
            for row in table:
                print(" | ".join(str(cell) for cell in row))