"""Collect the text of key assessment documents from a reference folder.

The script lists ``base_dir``, keeps the file names that contain any of
``key_patterns``, extracts text from the first ``MAX_FILES`` of them
(.docx, .doc and .pdf only), writes the result to ``OUTPUT_PATH`` as JSON
and prints a preview of every extracted document.
"""
import json
import os
import sys

base_dir = r"D:\医院绩效系统\参考文档"
OUTPUT_PATH = r"D:\医院绩效系统\key_content.json"

# Only this many key files (in sorted order) are considered per run.
MAX_FILES = 20
# Number of characters of each document echoed to stdout at the end.
PREVIEW_CHARS = 4000

# Key patterns to search for (plain substring match against file names).
key_patterns = [
    "附表一", "附表二", "附表三", "附表四", "附表五", "附表六", "附表七",
    "附表八", "附表九", "附表十", "附表十一", "附表十二", "附表十三",
    "一票否决", "职能科室公共", "护理部", "院感", "医保", "药学",
    "手术临床", "非手术", "医疗技术", "医疗辅助", "行政科室", "职工绩效",
    "KPI"
]


def find_files(patterns, files=None):
    """Return the names in *files* that contain at least one of *patterns*.

    Args:
        patterns: Iterable of substrings to look for.
        files: Iterable of file names to filter. When omitted, the
            contents of ``base_dir`` are listed and used.

    Returns:
        A list of matching names in their original order; a name that
        matches several patterns appears only once.
    """
    if files is None:
        files = os.listdir(base_dir)
    return [name for name in files if any(p in name for p in patterns)]


def read_docx(filepath):
    """Extract paragraph and table text from a .docx file.

    Non-empty paragraphs come first, one per line, followed by every table
    row that has at least one non-empty cell, with its cells joined by
    ``" | "``.

    Returns:
        The extracted text, or ``"Error: <message>"`` if the document
        could not be read.
    """
    # Third-party (python-docx). Imported outside the try block on purpose:
    # a missing package should fail loudly, not be recorded as file content.
    import docx

    try:
        doc = docx.Document(filepath)
        lines = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
        for table in doc.tables:
            for row in table.rows:
                cells = [cell.text.strip() for cell in row.cells]
                if any(cells):
                    lines.append(" | ".join(cells))
        return "\n".join(lines)
    except Exception as e:
        # Per-file boundary: one unreadable document must not abort the run.
        return f"Error: {e}"


def read_doc(filepath):
    """Best-effort decode of a .doc file's raw bytes as text.

    The bytes are decoded with the first of UTF-8, GBK and GB2312 that
    accepts them; if none does, Latin-1 is used, which maps every byte to
    a character and therefore cannot fail.

    NOTE(review): this decodes the file's raw bytes and does no parsing of
    the legacy binary Word format, so the result for genuine .doc files is
    likely to be mostly noise - confirm whether converting them to .docx
    first would be acceptable.

    Returns:
        The decoded text, or ``"Error: <message>"`` if the file could not
        be read.
    """
    try:
        with open(filepath, "rb") as f:
            raw = f.read()
    except Exception as e:
        # Same per-file error contract as the other readers.
        return f"Error: {e}"

    for enc in ("utf-8", "gbk", "gb2312"):
        try:
            return raw.decode(enc)
        except UnicodeDecodeError:
            continue
    return raw.decode("latin-1")


def read_pdf(filepath):
    """Extract the text of every page of a PDF, joined by newlines.

    Pages for which no text is extracted are skipped.

    Returns:
        The extracted text, or ``"Error: <message>"`` if the PDF could
        not be read.
    """
    # Third-party; imported outside the try block so that a missing
    # package fails loudly instead of being recorded as file content.
    import pdfplumber

    try:
        pages = []
        with pdfplumber.open(filepath) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    pages.append(page_text)
        return "\n".join(pages)
    except Exception as e:
        # Per-file boundary: one unreadable document must not abort the run.
        return f"Error: {e}"


# Order matters only for readability: ".docx" does not end with ".doc".
_READERS = (
    (".docx", read_docx),
    (".doc", read_doc),
    (".pdf", read_pdf),
)


def reader_for(filename):
    """Return the reader function for *filename*, or ``None``.

    The extension is compared case-insensitively so that names such as
    ``REPORT.DOCX`` are not silently skipped.
    """
    lowered = filename.lower()
    for ext, reader in _READERS:
        if lowered.endswith(ext):
            return reader
    return None


def main():
    """Scan ``base_dir``, extract the key files and save/print the result."""
    # Emit UTF-8 regardless of the console's default encoding; the file
    # names and document contents printed below are non-ASCII.
    sys.stdout.reconfigure(encoding='utf-8')

    # Get all files
    all_files = os.listdir(base_dir)
    print(f"Total files found: {len(all_files)}")

    key_files = sorted(find_files(key_patterns, all_files))
    print(f"\nKey assessment files found: {len(key_files)}")
    for name in key_files:
        print(f" - {name}")

    # Read and save key files. The limit is applied before the extension
    # check, so files of unsupported types still count towards it.
    results = {}
    for filename in key_files[:MAX_FILES]:
        filepath = os.path.join(base_dir, filename)
        print(f"\nReading: {filename}")
        reader = reader_for(filename)
        if reader is None:
            continue
        content = reader(filepath)
        results[filename] = content
        print(f"Content length: {len(content)} chars")

    # Save results
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\n\nSaved {len(results)} files to key_content.json")

    # Print content
    for filename, content in results.items():
        print(f"\n{'='*80}")
        print(f"FILE: {filename}")
        print(f"{'='*80}")
        print(content[:PREVIEW_CHARS])


if __name__ == "__main__":
    main()