提交文件
This commit is contained in:
114
extract_kpi.py
Normal file
114
extract_kpi.py
Normal file
@@ -0,0 +1,114 @@
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import win32com.client
|
||||
|
||||
# Switch stdout to UTF-8 before anything is printed.
sys.stdout.reconfigure(encoding='utf-8')

base_dir = r"D:\医院绩效系统\参考文档"
temp_dir = r"D:\temp_docs"

# Working directory for the renamed copies; an existing directory is reused.
os.makedirs(temp_dir, exist_ok=True)

# Directory listing of the source folder (names only, no paths).
files = os.listdir(base_dir)

# Keep, in sorted order, the entries whose name starts with a two-digit
# prefix 01-13 followed by a dot. str.startswith accepts a tuple of
# alternatives, so one call covers all thirteen prefixes.
key_prefixes = tuple(f"{number:02d}." for number in range(1, 14))
key_files = [name for name in sorted(files) if name.startswith(key_prefixes)]

print(f"Key appendix files: {len(key_files)}")

# Copy each selected file into temp_dir under a short sequential name
# (doc01.doc, doc02.doc, ...) and remember which original it stands for.
file_mapping = {}
for i, filename in enumerate(key_files):
    simple_name = f"doc{i+1:02d}.doc"
    shutil.copy2(
        os.path.join(base_dir, filename),
        os.path.join(temp_dir, simple_name),
    )
    file_mapping[simple_name] = filename
    print(f"Copied: {filename} -> {simple_name}")
# Read files using win32com from temp directory.
#
# For every entry of file_mapping (temp name -> original name) the document
# is opened read-only in a hidden Word instance. On success
# results[original_name] holds {'text': <document text>, 'tables': <list of
# tables, each a list of rows, each a list of cell strings>}; on failure it
# holds {'error': <message>} and processing continues with the next file.
print("\n\n=== READING FILES ===\n")

# Word terminates the text of every table cell with an end-of-cell marker
# (carriage return + BEL). BEL is not whitespace, so str.strip() alone leaves
# the marker in place and makes every cell look non-empty.
CELL_END_MARKER = "\r\x07"

results = {}

word = win32com.client.Dispatch("Word.Application")
try:
    word.Visible = False
    word.DisplayAlerts = False

    for simple_name, original_name in file_mapping.items():
        filepath = os.path.join(temp_dir, simple_name)
        print(f"\nReading: {original_name}")

        doc = None
        try:
            doc = word.Documents.Open(filepath, ReadOnly=True)
            text = doc.Content.Text
            tables = []

            # Extract tables
            # NOTE(review): Word is known to refuse row-wise access on tables
            # with vertically merged cells; such a document ends up in the
            # error branch below - confirm against the real input files.
            for table in doc.Tables:
                table_data = []
                for row in table.Rows:
                    row_data = [
                        cell.Range.Text.rstrip(CELL_END_MARKER).strip()
                        for cell in row.Cells
                    ]
                    # Skip rows in which every cell is empty.
                    if any(row_data):
                        table_data.append(row_data)
                if table_data:
                    tables.append(table_data)

            results[original_name] = {
                'text': text,
                'tables': tables
            }
            print(f" Success: {len(text)} chars, {len(tables)} tables")
        except Exception as e:
            # Per-file boundary: record the failure and keep going.
            results[original_name] = {'error': str(e)}
            print(f" Error: {e}")
        finally:
            # Close the document even when extraction failed so it does not
            # stay open (and locked) inside the Word instance. False means
            # "do not save changes".
            if doc is not None:
                try:
                    doc.Close(False)
                except Exception as close_error:
                    print(f" Warning: could not close document: {close_error}")
finally:
    # Always shut Word down; otherwise a hidden WINWORD process is left
    # running whenever something above raises.
    word.Quit()
# Save results
# Written before the cleanup so that a cleanup failure can no longer discard
# the extracted data.
with open(r"D:\医院绩效系统\kpi_extracted.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print("\n\nSaved to kpi_extracted.json")

# Cleanup temp files
# Only the copies recorded in file_mapping are removed: temp_dir is created
# with exist_ok=True, so it may have existed before this run and may hold
# files that do not belong to this script.
for simple_name in file_mapping:
    try:
        os.remove(os.path.join(temp_dir, simple_name))
    except FileNotFoundError:
        # Already gone - nothing to clean up for this entry.
        pass
    except OSError as e:
        print(f"Warning: could not remove {simple_name}: {e}")

# os.rmdir only succeeds on an empty directory; a directory that still has
# other content is deliberately left in place.
try:
    os.rmdir(temp_dir)
except OSError as e:
    print(f"Warning: temp directory not removed: {e}")
# Print content summary: one banner per file, followed by either the recorded
# error or the first 4000 characters of text and every extracted table.
print("\n\n=== FILE CONTENTS ===\n")
for filename, data in results.items():
    print(f"\n{'='*80}")
    print(f"FILE: {filename}")
    print(f"{'='*80}")

    # Anything that is not a dict is printed verbatim.
    if not isinstance(data, dict):
        print(data)
        continue

    # Failed files carry only an error message.
    if 'error' in data:
        print(f"Error: {data['error']}")
        continue

    if 'text' not in data:
        continue

    text = data['text']
    print(f"Text content ({len(text)} chars):")
    print(text[:4000])

    if not data.get('tables'):
        continue

    print(f"\nTables ({len(data['tables'])}):")
    for i, table in enumerate(data['tables']):
        print(f"\n Table {i+1}:")
        for row in table:
            # Each cell is cut to 80 characters to keep rows readable.
            print(f" {' | '.join(str(c)[:80] for c in row)}")
Reference in New Issue
Block a user