"""Extract text from .docx, .doc and .pdf files in a folder into one JSON file.

Each reader returns the extracted text, or a string starting with
"Error reading <kind>: " when the file could not be processed, so that one bad
document never aborts the whole run.
"""
import json
import os
import shutil
import subprocess
import sys

# The third-party parsers are optional at import time: a missing parser only
# disables its own format (reported per file) instead of aborting everything.
try:
    import docx
except ImportError:
    docx = None

try:
    import pdfplumber
except ImportError:
    pdfplumber = None

base_dir = r"D:\医院绩效系统\参考文档"
OUTPUT_PATH = r"D:\医院绩效系统\extracted_content.json"

# Extensions handled by this script (matched case-insensitively).
DOCUMENT_EXTENSIONS = (".docx", ".doc", ".pdf")

# Substrings that mark a file name as a key assessment document.
KEY_FILE_MARKERS = ("附表", "考核", "KPI")

# Signature found at the start of every binary (OLE2) Word .doc file.
_OLE2_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"


def read_docx(filepath):
    """Return the text of a .docx file: paragraphs first, then table rows.

    Blank paragraphs and fully empty table rows are skipped. The cells of a
    table row are joined with " | ". On any failure an
    "Error reading docx: ..." string is returned instead of raising.
    """
    if docx is None:
        return "Error reading docx: python-docx is not installed"
    try:
        document = docx.Document(filepath)
        lines = []
        for para in document.paragraphs:
            stripped = para.text.strip()
            if stripped:
                lines.append(stripped)
        # Tables are not part of doc.paragraphs, so they are read separately.
        for table in document.tables:
            for row in table.rows:
                cells = [cell.text.strip() for cell in row.cells]
                if any(cells):
                    lines.append(" | ".join(cells))
        return "\n".join(lines)
    except Exception as e:
        return f"Error reading docx: {e}"


def _run_antiword(filepath):
    """Convert a binary .doc to UTF-8 text with antiword, or return None."""
    executable = shutil.which("antiword")
    if executable is None:
        return None
    try:
        proc = subprocess.run(
            [executable, "-m", "UTF-8.txt", filepath],
            capture_output=True,
            check=False,
            timeout=60,
        )
    except (OSError, subprocess.SubprocessError):
        return None
    if proc.returncode != 0:
        return None
    return proc.stdout.decode("utf-8", errors="replace")


def read_doc(filepath):
    """Return the text of a .doc file (old Word format).

    Files that carry the binary OLE2 signature are converted with the
    ``antiword`` tool when it is available on PATH. Everything else (some
    .doc files are actually plain text), and binary files when antiword is
    missing or fails, is read as UTF-8 text with undecodable bytes dropped.
    On any failure an "Error reading doc: ..." string is returned.
    """
    try:
        with open(filepath, "rb") as f:
            is_binary_doc = f.read(len(_OLE2_MAGIC)) == _OLE2_MAGIC
        if is_binary_doc:
            converted = _run_antiword(filepath)
            if converted is not None:
                return converted
        # Lossy fallback: best effort only for real binary documents.
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    except Exception as e:
        return f"Error reading doc: {e}"


def read_pdf(filepath):
    """Return the text of a PDF file, one extracted page per chunk.

    Pages with no extractable text are skipped. On any failure an
    "Error reading pdf: ..." string is returned instead of raising.
    """
    if pdfplumber is None:
        return "Error reading pdf: pdfplumber is not installed"
    try:
        pages = []
        with pdfplumber.open(filepath) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    pages.append(page_text)
        return "\n".join(pages)
    except Exception as e:
        return f"Error reading pdf: {e}"


def get_all_files(directory):
    """Return the sorted names of the document files directly in *directory*.

    Extensions are matched case-insensitively. Word owner/lock files
    ("~$name.docx"), which exist while a document is open, are skipped
    because they are not documents.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if name.lower().endswith(DOCUMENT_EXTENSIONS)
        and not name.startswith("~$")
    )


# Extension -> reader, checked with endswith() on the lower-cased file name.
_READERS = (
    (".docx", read_docx),
    (".doc", read_doc),
    (".pdf", read_pdf),
)


def extract_directory(directory):
    """Read every document in *directory*; return {file name: text or error}."""
    results = {}
    for filename in get_all_files(directory):
        print(f"Reading: {filename}")
        lowered = filename.lower()
        for extension, reader in _READERS:
            if lowered.endswith(extension):
                results[filename] = reader(os.path.join(directory, filename))
                break
    return results


def main(source_dir=base_dir, output_path=OUTPUT_PATH):
    """Extract all documents in *source_dir* and save them to *output_path*.

    Prints progress, a count of extracted files and a summary (first 20) of
    the files whose names contain one of KEY_FILE_MARKERS. Returns the
    results dictionary that was written as JSON.
    """
    # Set console encoding to UTF-8 so Chinese file names print correctly.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding='utf-8')

    missing = [
        package
        for package, module in (("python-docx", docx), ("pdfplumber", pdfplumber))
        if module is None
    ]
    if missing:
        print(
            f"Warning: missing optional parsers: {', '.join(missing)}",
            file=sys.stderr,
        )

    results = extract_directory(source_dir)

    # Save results
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\nExtracted {len(results)} files")
    print(f"Content saved to {os.path.basename(output_path)}")

    # Also print summary of key files
    key_files = [
        name
        for name in results
        if any(marker in name for marker in KEY_FILE_MARKERS)
    ]
    print(f"\nKey assessment files found: {len(key_files)}")
    for name in key_files[:20]:
        print(f" - {name}")

    return results


if __name__ == "__main__":
    main()