Files
hospital_performance/extract_kpi.py
2026-02-28 15:16:15 +08:00

115 lines
3.4 KiB
Python

import os
import sys
import json
import shutil
import win32com.client
# Force UTF-8 on stdout; the paths below (and presumably the document names)
# contain non-ASCII characters that a legacy console code page may not encode.
sys.stdout.reconfigure(encoding='utf-8')

# Source folder holding the reference documents, and a scratch folder with an
# ASCII-only path used for temporary copies.
base_dir = r"D:\医院绩效系统\参考文档"
temp_dir = r"D:\temp_docs"

# Create temp directory (no error if it already exists)
os.makedirs(temp_dir, exist_ok=True)

# Get actual file names using os.listdir
files = os.listdir(base_dir)

# Prefixes "01." through "13." identify the key appendix files.
_KEY_PREFIXES = tuple(f"{number:02d}." for number in range(1, 14))

# Keep, in sorted order, every entry whose name starts with one of the
# prefixes. str.startswith accepts a tuple, so a single call replaces the
# long `or` chain. Directories are skipped because the later copy step
# (shutil.copy2) only works on regular files.
key_files = [
    name
    for name in sorted(files)
    if name.startswith(_KEY_PREFIXES)
    and os.path.isfile(os.path.join(base_dir, name))
]
print(f"Key appendix files: {len(key_files)}")
# Stage every key file in the temp directory under a short generated name
# (doc01.doc, doc02.doc, ...) and remember which original file each staged
# copy came from. NOTE(review): presumably done so Word is handed a plain
# ASCII path instead of the original non-ASCII one -- confirm.
file_mapping = {}
for position, filename in enumerate(key_files, start=1):
    simple_name = "doc{:02d}.doc".format(position)
    shutil.copy2(
        os.path.join(base_dir, filename),
        os.path.join(temp_dir, simple_name),
    )
    # Staged name -> original name, used later to label the results.
    file_mapping[simple_name] = filename
    print(f"Copied: {filename} -> {simple_name}")
# Read files using win32com from temp directory
print("\n\n=== READING FILES ===\n")


def _clean_cell_text(raw):
    """Return a table cell's text without Word's end-of-cell marker.

    Word terminates the text of every table cell with the two characters
    '\\r\\x07'. A bare str.strip() cannot remove '\\x07' (it is not
    whitespace), so the marker is stripped explicitly before trimming the
    remaining surrounding whitespace.
    """
    return raw.rstrip('\r\x07').strip()


def _extract_tables(doc):
    """Return the tables of an open Word document as nested lists.

    The result is a list of tables; each table is a list of rows; each row
    is a list of cleaned cell strings. Rows whose cells are all empty and
    tables left with no rows are omitted.

    NOTE(review): iterating ``table.Rows`` is believed to fail in Word for
    tables containing vertically merged cells; such a failure propagates to
    the caller, which records it as an error for the whole document.
    """
    tables = []
    for table in doc.Tables:
        table_data = []
        for row in table.Rows:
            row_data = [_clean_cell_text(cell.Range.Text) for cell in row.Cells]
            if any(row_data):
                table_data.append(row_data)
        if table_data:
            tables.append(table_data)
    return tables


# Maps original file name -> {'text': ..., 'tables': ...} on success,
# or {'error': message} when the document could not be read.
results = {}
word = win32com.client.Dispatch("Word.Application")
try:
    word.Visible = False
    word.DisplayAlerts = False
    for simple_name, original_name in file_mapping.items():
        filepath = os.path.join(temp_dir, simple_name)
        print(f"\nReading: {original_name}")
        doc = None
        try:
            doc = word.Documents.Open(filepath, ReadOnly=True)
            text = doc.Content.Text
            tables = _extract_tables(doc)
            results[original_name] = {
                'text': text,
                'tables': tables
            }
            print(f" Success: {len(text)} chars, {len(tables)} tables")
        except Exception as e:
            # Best effort: one unreadable document must not stop the batch.
            results[original_name] = {'error': str(e)}
            print(f" Error: {e}")
        finally:
            # Always release the document, even when extraction failed,
            # so Word does not keep the temp copy open and locked.
            if doc is not None:
                try:
                    doc.Close(SaveChanges=0)  # 0 == wdDoNotSaveChanges
                except Exception as close_error:
                    print(f" Warning: could not close document: {close_error}")
finally:
    # Quit Word no matter what happened above; otherwise a hidden
    # WINWORD process is left running.
    word.Quit()
# Cleanup temp files.
# Only the copies recorded in file_mapping are deleted: temp_dir is created
# with exist_ok=True, so it may already have held unrelated files that must
# not be removed. Failures are reported instead of raised so that a cleanup
# problem cannot abort the script before the extracted results are saved.
for staged_name in file_mapping:
    staged_path = os.path.join(temp_dir, staged_name)
    try:
        os.remove(staged_path)
    except OSError as cleanup_error:
        print(f"Warning: could not remove {staged_path}: {cleanup_error}")

# os.rmdir only succeeds on an empty directory; if anything else is still
# in there, leave the directory in place.
try:
    os.rmdir(temp_dir)
except OSError as cleanup_error:
    print(f"Warning: temp directory not removed: {cleanup_error}")
# Persist the extraction results as indented JSON. ensure_ascii=False keeps
# non-ASCII text readable in the file instead of \uXXXX escapes.
output_path = r"D:\医院绩效系统\kpi_extracted.json"
with open(output_path, "w", encoding="utf-8") as out_file:
    json.dump(results, out_file, ensure_ascii=False, indent=2)
print("\n\nSaved to kpi_extracted.json")
# Print a human-readable summary of everything that was extracted:
# a banner per file, then either the recorded error or the first 4000
# characters of text followed by the tables (cells truncated to 80 chars).
print("\n\n=== FILE CONTENTS ===\n")
banner = '=' * 80
for filename, data in results.items():
    print(f"\n{banner}")
    print(f"FILE: {filename}")
    print(banner)

    # Anything that is not a result dict is printed verbatim.
    if not isinstance(data, dict):
        print(data)
        continue

    # A failed document only has its error message to show.
    if 'error' in data:
        print(f"Error: {data['error']}")
        continue

    # A dict with neither 'error' nor 'text' prints nothing further.
    if 'text' not in data:
        continue

    text = data['text']
    print(f"Text content ({len(text)} chars):")
    print(text[:4000])

    extracted_tables = data.get('tables')
    if not extracted_tables:
        continue

    print(f"\nTables ({len(extracted_tables)}):")
    for number, table in enumerate(extracted_tables, start=1):
        print(f"\n Table {number}:")
        for row in table:
            joined_cells = ' | '.join(str(cell)[:80] for cell in row)
            print(f" {joined_cells}")