"""Extract paragraphs and tables from reference .docx files and dump them to JSON."""
import json
import os
import sys

import docx

# Directory that is scanned (non-recursively) for .docx files.
base_dir = r"D:\医院绩效系统\参考文档"
# Destination of the JSON dump holding the extracted content of every file.
output_path = r"D:\医院绩效系统\docx_content.json"
# A file whose name contains any of these substrings gets its content printed
# in full in the detailed section.
key_file_markers = ['考核', '评分', '职能']
# Maximum number of paragraphs printed per file in the detailed section.
max_detail_paragraphs = 20


def read_docx(filepath):
    """Read a .docx file and extract its text and tables.

    Args:
        filepath: Path of the document to open with ``docx.Document``.

    Returns:
        On success, a dict with two keys:
          * ``'paragraphs'``: list of stripped paragraph texts; paragraphs
            that are empty after stripping are omitted.
          * ``'tables'``: list of tables, each a list of rows, each row a list
            of stripped cell texts. Rows whose cells are all empty are
            omitted, and so are tables left without any row.
        On failure, ``{'error': <exception message>}``. No exception is
        propagated to the caller.
    """
    try:
        doc = docx.Document(filepath)

        paragraphs = []
        for para in doc.paragraphs:
            text = para.text.strip()
            if text:
                paragraphs.append(text)

        tables = []
        for table in doc.tables:
            rows = []
            for row in table.rows:
                cells = [cell.text.strip() for cell in row.cells]
                if any(cells):
                    rows.append(cells)
            if rows:
                tables.append(rows)

        return {'paragraphs': paragraphs, 'tables': tables}
    except Exception as e:
        # Deliberate best-effort boundary: one unreadable document must not
        # abort the whole batch, so the failure is reported in the result.
        return {'error': str(e)}


def _list_docx_files(directory):
    """Return the sorted names of the .docx documents directly inside *directory*."""
    return sorted(
        name
        for name in os.listdir(directory)
        # Names starting with "~$" are the owner/lock files Word keeps next to
        # an open document; they are not valid .docx packages.
        if name.lower().endswith('.docx') and not name.startswith('~$')
    )


def _print_summary(content):
    """Print paragraph/table counts for one ``read_docx`` result, or its error."""
    if 'error' in content:
        # Without this, a failed read would look like an empty document.
        print(f" Error: {content['error']}")
        return
    print(f" Paragraphs: {len(content.get('paragraphs', []))}")
    print(f" Tables: {len(content.get('tables', []))}")
    for i, table in enumerate(content.get('tables', [])):
        print(f" Table {i+1}: {len(table)} rows x {len(table[0]) if table else 0} cols")


def _print_details(filename, content):
    """Print the leading paragraphs and every table row of one document."""
    print(f"\n{'='*80}")
    print(f"FILE: {filename}")
    print(f"{'='*80}")

    if 'error' in content:
        print(f"Error: {content['error']}")
        return

    if content.get('paragraphs'):
        print("\n--- Paragraphs ---")
        for p in content['paragraphs'][:max_detail_paragraphs]:
            print(p)

    if content.get('tables'):
        print("\n--- Tables ---")
        for i, table in enumerate(content['tables']):
            print(f"\nTable {i+1}:")
            for row in table:
                print(" | ".join(str(cell) for cell in row))


def main():
    """Read every .docx in ``base_dir``, save the content as JSON, print a report."""
    # The file names and document text contain non-ASCII characters; force
    # UTF-8 so printing does not depend on the console's default encoding.
    sys.stdout.reconfigure(encoding='utf-8')

    docx_files = _list_docx_files(base_dir)
    print(f"Found {len(docx_files)} .docx files\n")

    # Read each file, keeping the result (or error) keyed by file name.
    all_content = {}
    for filename in docx_files:
        print(f"Reading: {filename}")
        content = read_docx(os.path.join(base_dir, filename))
        all_content[filename] = content
        _print_summary(content)

    # ensure_ascii=False keeps the Chinese text readable in the JSON file.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_content, f, ensure_ascii=False, indent=2)
    print("\nSaved content to docx_content.json")

    # Print detailed content for the key assessment files only.
    key_files = [f for f in docx_files if any(k in f for k in key_file_markers)]
    print("\n\n=== DETAILED CONTENT FOR KEY ASSESSMENT FILES ===\n")
    for filename in key_files:
        _print_details(filename, all_content.get(filename, {}))


if __name__ == "__main__":
    main()