# hospital_performance/read_key_docs.py

import os
import sys
import docx
import pdfplumber
import json

# Switch stdout to UTF-8 before anything is printed.
# NOTE(review): presumably needed because the file names printed below are
# non-ASCII and the console's default encoding may not cover them - confirm.
sys.stdout.reconfigure(encoding='utf-8')

# Directory that is scanned for reference documents (hard-coded Windows path).
base_dir = r"D:\医院绩效系统\参考文档"

# Names of every entry directly inside base_dir. os.listdir() is not recursive
# and returns bare names rather than full paths. find_files() reads this list.
all_files = os.listdir(base_dir)
print(f"Total files found: {len(all_files)}")

def find_files(patterns):
    """Return the names in ``all_files`` that contain any of ``patterns``.

    Matching is a plain, case-sensitive substring test (``pattern in name``).
    A name is listed at most once even if several patterns match it, and the
    result keeps the order of ``all_files``.
    """
    return [
        name
        for name in all_files
        if any(fragment in name for fragment in patterns)
    ]

# Name fragments used to pick out the key files. find_files() performs a
# substring test, so a file is selected as soon as its name contains any one
# of these entries.
key_patterns = [
    # Numbered tables: 附表一 ... 附表十三
    "附表一",
    "附表二",
    "附表三",
    "附表四",
    "附表五",
    "附表六",
    "附表七",
    "附表八",
    "附表九",
    "附表十",
    "附表十一",
    "附表十二",
    "附表十三",
    # Remaining keywords
    "一票否决",
    "职能科室公共",
    "护理部",
    "院感",
    "医保",
    "药学",
    "手术临床",
    "非手术",
    "医疗技术",
    "医疗辅助",
    "行政科室",
    "职工绩效",
    "KPI",
]

# Select the matching names and list them in sorted order.
key_files = find_files(key_patterns)
print(f"\nKey assessment files found: {len(key_files)}")
for name in sorted(key_files):
    print(f"  - {name}")

def read_docx(filepath):
    """Extract the text of a .docx file as one newline-joined string.

    The stripped text of every non-blank entry in ``doc.paragraphs`` comes
    first. It is followed by one line per table row, built by joining the
    stripped cell texts with " | "; rows whose cells are all empty are
    dropped.

    Any exception is caught and reported as the string ``"Error: <message>"``
    rather than raised, so the caller always receives a ``str``.
    """
    try:
        document = docx.Document(filepath)

        # Body paragraphs, skipping those that are empty after stripping.
        lines = [
            stripped
            for stripped in (p.text.strip() for p in document.paragraphs)
            if stripped
        ]

        # Tables are appended after the paragraphs, one output line per row.
        for table in document.tables:
            for row in table.rows:
                cells = [cell.text.strip() for cell in row.cells]
                if any(cells):
                    lines.append(" | ".join(cells))

        return "\n".join(lines)
    except Exception as exc:
        return f"Error: {exc}"

def read_doc(filepath):
    """Read a file's raw bytes and decode them as text.

    The bytes are decoded with the first of UTF-8, GBK and GB2312 that
    accepts them in strict mode. If none does, Latin-1 is used; it maps
    every byte value to a code point and therefore cannot fail.

    NOTE(review): this decodes the file's bytes directly and does not parse
    any document container format, so the result for a binary file is the
    raw byte content rendered as text - confirm this is what callers expect.

    Args:
        filepath: Path of the file to read.

    Returns:
        The decoded text, or the string ``"Error: <message>"`` if the file
        cannot be read (the exception is not propagated).
    """
    try:
        with open(filepath, 'rb') as f:
            raw = f.read()

        # Only a failed decode should move on to the next candidate. A bare
        # ``except`` here would also swallow KeyboardInterrupt / SystemExit.
        for enc in ('utf-8', 'gbk', 'gb2312'):
            try:
                return raw.decode(enc)
            except UnicodeDecodeError:
                continue

        # Explicit last resort: Latin-1 always succeeds, so no further
        # fallback after this line could ever be reached.
        return raw.decode('latin-1')
    except Exception as e:
        return f"Error: {e}"

def read_pdf(filepath):
    """Extract the text of a PDF as one newline-joined string.

    ``extract_text()`` is called on each page in order; pages for which it
    returns nothing (``None`` or an empty string) are left out, and the
    remaining page texts are joined with newlines.

    Any exception is caught and reported as the string ``"Error: <message>"``
    rather than raised, so the caller always receives a ``str``.
    """
    try:
        with pdfplumber.open(filepath) as document:
            extracted = [page.extract_text() for page in document.pages]
        return "\n".join(chunk for chunk in extracted if chunk)
    except Exception as exc:
        return f"Error: {exc}"

# Read and save key files

# Supported file suffixes paired with the function that extracts their text.
readers = (
    ('.docx', read_docx),
    ('.doc', read_doc),
    ('.pdf', read_pdf),
)

# Maps file name -> extracted text (or the "Error: ..." string returned by a
# reader when extraction failed).
results = {}
for filename in sorted(key_files)[:20]:  # Limit to first 20
    filepath = os.path.join(base_dir, filename)
    print(f"\nReading: {filename}")

    # Compare the suffix case-insensitively so that names such as "X.DOCX"
    # or "X.PDF" are processed instead of being silently skipped.
    lowered = filename.lower()
    reader = next((fn for suffix, fn in readers if lowered.endswith(suffix)), None)
    if reader is None:
        # Unsupported type: nothing is recorded for this entry.
        continue

    content = reader(filepath)
    results[filename] = content
    print(f"Content length: {len(content)} chars")

# Save results
# ensure_ascii=False keeps non-ASCII text readable in the JSON file, which is
# why the file is opened with an explicit UTF-8 encoding.
with open(r"D:\医院绩效系统\key_content.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"\n\nSaved {len(results)} files to key_content.json")

# Print content
for filename, content in results.items():
    print(f"\n{'='*80}")
    print(f"FILE: {filename}")
    print(f"{'='*80}")
    # Slicing already caps at the string's length, so no length check is
    # needed to show at most the first 4000 characters.
    preview = content[:4000]
    print(preview)