# hospital_performance/read_docs.py

import os
import sys
import docx
import pdfplumber
import json

# Set console encoding to UTF-8.
# The script prints file names to stdout; presumably this is needed because
# those names contain non-ASCII (Chinese) characters that the default console
# code page cannot encode -- confirm on the target machine.
sys.stdout.reconfigure(encoding='utf-8')

# Directory that is scanned (non-recursively) for .docx / .doc / .pdf files.
base_dir = r"D:\医院绩效系统\参考文档"

def read_docx(filepath):
    """Extract the text of a .docx file as one newline-joined string.

    All non-blank paragraphs come first (whitespace-stripped), followed by
    one line per non-empty table row with the stripped cell texts joined
    by " | ".

    Never raises: on any failure the returned string is
    "Error reading docx: <exception message>".
    """
    try:
        document = docx.Document(filepath)

        # Body paragraphs, skipping those that are empty after stripping.
        stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
        lines = [entry for entry in stripped if entry]

        # Tables are appended after the paragraphs, one output line per row.
        for table in document.tables:
            for row in table.rows:
                cells = [cell.text.strip() for cell in row.cells]
                # Rows whose cells are all empty contribute nothing.
                if any(cells):
                    lines.append(" | ".join(cells))

        return "\n".join(lines)
    except Exception as exc:
        return f"Error reading docx: {exc}"

def read_doc(filepath):
    """Read a .doc file as plain text (best effort).

    Some files carrying a .doc extension are really plain text; this
    function only handles that case. It does NOT parse the binary Word
    format: a genuine binary .doc falls through to the lossy last resort
    below and yields mostly noise.

    Decoding is attempted in this order:

    1. strict UTF-8 (a leading BOM, if present, is stripped);
    2. strict GB18030, which also covers GBK/GB2312 text, so legacy-encoded
       Chinese text files are decoded instead of having their bytes
       silently dropped;
    3. UTF-8 with undecodable bytes ignored.

    Never raises: on any failure (e.g. the file is missing) the returned
    string is "Error reading doc: <exception message>".
    """
    try:
        for encoding in ('utf-8-sig', 'gb18030'):
            try:
                with open(filepath, 'r', encoding=encoding) as f:
                    return f.read()
            except UnicodeDecodeError:
                # Not valid in this encoding; try the next candidate.
                continue

        # Last resort: keep whatever happens to decode as UTF-8.
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            return f.read()
    except Exception as e:
        return f"Error reading doc: {e}"

def read_pdf(filepath):
    """Extract the text of a PDF file as one newline-joined string.

    Pages are visited in order; pages for which no text is extracted are
    skipped.

    Never raises: on any failure the returned string is
    "Error reading pdf: <exception message>".
    """
    try:
        with pdfplumber.open(filepath) as document:
            # Lazily pull the text of each page while the PDF is still open.
            extracted = (page.extract_text() for page in document.pages)
            return "\n".join(chunk for chunk in extracted if chunk)
    except Exception as exc:
        return f"Error reading pdf: {exc}"

def get_all_files(directory):
    """Return the sorted names of the document files directly in *directory*.

    An entry is kept when its name ends in .docx, .doc or .pdf. The
    comparison is case-insensitive, so names such as "REPORT.PDF" or
    "x.Docx" are included too. Names are returned with their original
    spelling and without the directory prefix. Subdirectories are not
    descended into.

    Raises whatever os.listdir raises (e.g. FileNotFoundError) when
    *directory* cannot be listed.
    """
    extensions = ('.docx', '.doc', '.pdf')
    return sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(extensions)
    )

# Main extraction: read every document found in base_dir and collect its text
# in `results`, keyed by file name. The reader functions return an
# "Error reading ..." string instead of raising, so a failed file still gets
# an entry (holding that message) and is included in the final count.
results = {}
for filename in get_all_files(base_dir):
    filepath = os.path.join(base_dir, filename)
    print(f"Reading: {filename}")

    # Dispatch on the lower-cased name so that an extension spelled in upper
    # or mixed case (e.g. ".DOCX") is routed to a reader rather than being
    # dropped by the final `else`. ".docx" must be tested before ".doc".
    lowered = filename.lower()
    if lowered.endswith('.docx'):
        content = read_docx(filepath)
    elif lowered.endswith('.doc'):
        content = read_doc(filepath)
    elif lowered.endswith('.pdf'):
        content = read_pdf(filepath)
    else:
        continue

    results[filename] = content

# Save results as UTF-8 JSON; ensure_ascii=False keeps non-ASCII text readable
# in the output file instead of \uXXXX escapes.
with open(r"D:\医院绩效系统\extracted_content.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"\nExtracted {len(results)} files")
print("Content saved to extracted_content.json")

# Also print a summary of key files: those whose name contains one of the
# marker substrings below. At most the first 20 are listed.
key_files = [f for f in results if '附表' in f or '考核' in f or 'KPI' in f]
print(f"\nKey assessment files found: {len(key_files)}")
for f in key_files[:20]:
    print(f"  - {f}")