# Listing metadata: 115 lines, 3.4 KiB, Python
import contextlib
import json
import os
import shutil
import sys

import win32com.client

# ---------------------------------------------------------------------------
# Extract the text and tables of the numbered appendix documents ("01." to
# "13.") found in ``base_dir`` through Word automation, and dump the result
# to a JSON file.
# ---------------------------------------------------------------------------

# Folder holding the source documents.
base_dir = r"D:\医院绩效系统\参考文档"
# Staging folder the documents are copied into before Word opens them.
temp_dir = r"D:\temp_docs"
# Destination of the extracted data.
output_path = r"D:\医院绩效系统\kpi_extracted.json"

# File-name prefixes of the key appendix files: "01." through "13.".
KEY_PREFIXES = tuple(f"{number:02d}." for number in range(1, 14))

# Word terminates the text of every table cell with a paragraph mark followed
# by an end-of-cell marker (BEL). BEL is not whitespace, so a plain
# ``str.strip()`` leaves both characters in place.
CELL_END_MARK = "\x07"


def select_key_files(names, prefixes=KEY_PREFIXES):
    """Return, in sorted order, the names starting with one of *prefixes*."""
    return [name for name in sorted(names) if name.startswith(prefixes)]


def stage_files(key_files, src_dir, dst_dir):
    """Copy *key_files* from *src_dir* into *dst_dir* under simple names.

    Each copy is named ``docNN<ext>`` where NN is the 1-based position of the
    file in *key_files*.  The staging step is presumably there so Word is
    handed a short ASCII-only path (NOTE(review): confirm).  The original
    extension is therefore kept only when it is ASCII; otherwise ``.doc`` is
    used.  Entries that are not regular files are skipped instead of
    crashing the copy.

    Returns:
        dict mapping the staged file name to the original file name.
    """
    os.makedirs(dst_dir, exist_ok=True)
    mapping = {}
    for position, filename in enumerate(key_files, start=1):
        src = os.path.join(src_dir, filename)
        if not os.path.isfile(src):
            print(f"Skipped (not a file): {filename}")
            continue
        extension = os.path.splitext(filename)[1]
        if not extension or not extension.isascii():
            extension = ".doc"
        simple_name = f"doc{position:02d}{extension}"
        shutil.copy2(src, os.path.join(dst_dir, simple_name))
        mapping[simple_name] = filename
        print(f"Copied: {filename} -> {simple_name}")
    return mapping


def clean_cell_text(raw):
    """Return the text of a table cell without Word's end-of-cell marker."""
    return raw.replace(CELL_END_MARK, "").strip()


def table_rows(table):
    """Return the cells of a Word table as a list of rows (lists of cells).

    Word refuses per-row access on tables that contain vertically merged
    cells.  In that case the cells are read from ``table.Range.Cells`` and
    grouped by their ``RowIndex`` instead.
    """
    try:
        return [list(row.Cells) for row in table.Rows]
    except Exception:
        # The COM error type lives in pywin32; a broad catch keeps this
        # helper importable without it.  Any genuine failure is raised again
        # by the fallback access below.
        by_row = {}
        for cell in table.Range.Cells:
            by_row.setdefault(cell.RowIndex, []).append(cell)
        return [by_row[index] for index in sorted(by_row)]


def extract_tables(doc):
    """Return the tables of a Word document as nested lists of cell strings.

    Rows whose cells are all empty, and tables without any remaining row,
    are dropped.
    """
    tables = []
    for table in doc.Tables:
        rows = []
        for cells in table_rows(table):
            texts = [clean_cell_text(cell.Range.Text) for cell in cells]
            if any(texts):
                rows.append(texts)
        if rows:
            tables.append(rows)
    return tables


def read_document(word, filepath):
    """Open *filepath* read-only in *word* and return its text and tables.

    The document is closed without saving even when extraction fails, so the
    staged file is not left locked by Word.
    """
    doc = word.Documents.Open(filepath, ReadOnly=True)
    try:
        return {'text': doc.Content.Text, 'tables': extract_tables(doc)}
    finally:
        doc.Close(False)


def read_documents(word, file_mapping, folder):
    """Read every staged document listed in *file_mapping*.

    Args:
        word: Word application object (or any object with the same
            ``Documents.Open`` interface).
        file_mapping: dict of staged file name -> original file name.
        folder: directory containing the staged files.

    Returns:
        dict keyed by original file name.  Each value is either
        ``{'text': ..., 'tables': ...}`` or ``{'error': message}`` when the
        document could not be read; one bad file does not stop the batch.
    """
    results = {}
    for simple_name, original_name in file_mapping.items():
        print(f"\nReading: {original_name}")
        try:
            data = read_document(word, os.path.join(folder, simple_name))
        except Exception as e:
            results[original_name] = {'error': str(e)}
            print(f" Error: {e}")
        else:
            results[original_name] = data
            print(f" Success: {len(data['text'])} chars, {len(data['tables'])} tables")
    return results


def remove_staged_files(file_mapping, folder):
    """Best-effort removal of the staged copies and of *folder* if empty.

    Only files created by this script are deleted; anything else that was
    already in *folder* is left alone.
    """
    for simple_name in file_mapping:
        path = os.path.join(folder, simple_name)
        try:
            os.remove(path)
        except OSError as err:
            print(f"Warning: could not remove {path}: {err}")
    # The folder may legitimately be non-empty (pre-existing content).
    with contextlib.suppress(OSError):
        os.rmdir(folder)


def save_results(results, path):
    """Write *results* to *path* as indented UTF-8 JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


def print_summary(results):
    """Print a preview (first 4000 chars and all tables) of every result."""
    print("\n\n=== FILE CONTENTS ===\n")
    for filename, data in results.items():
        print(f"\n{'='*80}")
        print(f"FILE: {filename}")
        print(f"{'='*80}")

        if not isinstance(data, dict):
            print(data)
            continue

        if 'error' in data:
            print(f"Error: {data['error']}")
        elif 'text' in data:
            text = data['text']
            print(f"Text content ({len(text)} chars):")
            print(text[:4000])

            if data.get('tables'):
                print(f"\nTables ({len(data['tables'])}):")
                for number, table in enumerate(data['tables'], start=1):
                    print(f"\n Table {number}:")
                    for row in table:
                        print(f" {' | '.join(str(c)[:80] for c in row)}")


def main():
    """Stage the key appendix files, read them with Word, save and print."""
    sys.stdout.reconfigure(encoding='utf-8')

    key_files = select_key_files(os.listdir(base_dir))
    print(f"Key appendix files: {len(key_files)}")

    file_mapping = stage_files(key_files, base_dir, temp_dir)
    try:
        print("\n\n=== READING FILES ===\n")
        word = win32com.client.Dispatch("Word.Application")
        try:
            word.Visible = False
            word.DisplayAlerts = False
            results = read_documents(word, file_mapping, temp_dir)
        finally:
            # Always shut Word down, otherwise a hidden instance keeps
            # running and holds the staged files open.
            word.Quit()

        # Persist before cleanup so a cleanup problem cannot lose the data.
        save_results(results, output_path)
        print("\n\nSaved to kpi_extracted.json")
    finally:
        remove_staged_files(file_mapping, temp_dir)

    print_summary(results)


if __name__ == "__main__":
    main()