提交文件
This commit is contained in:
87
analyze_docx.py
Normal file
87
analyze_docx.py
Normal file
@@ -0,0 +1,87 @@
|
||||
import os
|
||||
import sys
|
||||
import docx
|
||||
import json
|
||||
|
||||
# Switch console output to UTF-8 so the non-ASCII file names and document
# text printed by this script can always be encoded.  The call is guarded
# because ``reconfigure`` is a method of ``io.TextIOWrapper``; when stdout has
# been replaced by another file-like object the method may be absent.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8')

# Folder that is scanned for the reference .docx documents.
base_dir = r"D:\医院绩效系统\参考文档"
|
||||
|
||||
def read_docx(filepath):
    """Read a .docx file and extract its paragraph text and table contents.

    Args:
        filepath: Path of the .docx file; passed unchanged to
            ``docx.Document``.

    Returns:
        On success, a dict with two keys:

        * ``'paragraphs'`` - list of the stripped text of every paragraph
          that is not blank, in the order ``doc.paragraphs`` yields them.
        * ``'tables'`` - list of tables; each table is a list of rows and
          each row is a list of stripped cell strings.  Rows in which every
          cell is empty are dropped, and so are tables left without rows.

        If any ``Exception`` is raised while opening or walking the
        document, ``{'error': <message>}`` is returned instead, so callers
        processing many files can record the failure and carry on.
    """
    try:
        doc = docx.Document(filepath)

        # Strip each paragraph once and keep only the non-blank ones.
        paragraphs = []
        for para in doc.paragraphs:
            text = para.text.strip()
            if text:
                paragraphs.append(text)

        tables = []
        for table in doc.tables:
            rows = []
            for row in table.rows:
                cells = [cell.text.strip() for cell in row.cells]
                # Skip rows that contain no text at all.
                if any(cells):
                    rows.append(cells)
            # Skip tables that ended up with no non-empty rows.
            if rows:
                tables.append(rows)

        return {'paragraphs': paragraphs, 'tables': tables}
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a batch run,
        # and the failure is reported to the caller through the result dict.
        return {'error': str(e)}
|
||||
|
||||
# Get all docx files in base_dir.  The suffix test is case-insensitive so a
# name such as "X.DOCX" is found too, and names starting with "~$" are
# skipped: those are the owner/lock files Word places next to a document that
# is open; they carry the .docx suffix but are not readable documents.
docx_files = [
    f for f in os.listdir(base_dir)
    if f.lower().endswith('.docx') and not f.startswith('~$')
]
print(f"Found {len(docx_files)} .docx files\n")
|
||||
|
||||
# Read and analyze each file: the extracted content is stored in all_content
# keyed by file name, and a short summary is printed per file.
all_content = {}
for filename in sorted(docx_files):
    filepath = os.path.join(base_dir, filename)
    print(f"Reading: {filename}")
    content = read_docx(filepath)
    all_content[filename] = content

    # read_docx reports a failure as {'error': message}.  Show the message
    # rather than a misleading "0 paragraphs / 0 tables" summary.
    if 'error' in content:
        print(f" Error: {content['error']}")
        continue

    # Print summary
    tables = content.get('tables', [])
    print(f" Paragraphs: {len(content.get('paragraphs', []))}")
    print(f" Tables: {len(tables)}")
    for i, table in enumerate(tables, start=1):
        # Column count is taken from the first row of the table.
        cols = len(table[0]) if table else 0
        print(f" Table {i}: {len(table)} rows x {cols} cols")
|
||||
|
||||
# Save to JSON: everything collected in all_content is written as indented
# UTF-8 JSON.  ensure_ascii=False makes json write non-ASCII characters as
# they are instead of as \uXXXX escapes.
with open(r"D:\医院绩效系统\docx_content.json", "w", encoding="utf-8") as f:
    json.dump(all_content, f, ensure_ascii=False, indent=2)

print("\nSaved content to docx_content.json")
|
||||
|
||||
# Print detailed content for key assessment files: the files whose name
# contains at least one of the keywords below (roughly "assessment",
# "scoring" and "function/department role").
key_files = [f for f in docx_files if any(k in f for k in ['考核', '评分', '职能'])]
print("\n\n=== DETAILED CONTENT FOR KEY ASSESSMENT FILES ===\n")

for filename in sorted(key_files):
    content = all_content.get(filename, {})
    print(f"\n{'='*80}")
    print(f"FILE: {filename}")
    print(f"{'='*80}")

    # A file that could not be read only has an 'error' entry; say so
    # instead of printing an empty section.
    if 'error' in content:
        print(f"\nCould not read file: {content['error']}")
        continue

    # Print paragraphs (only the first 20 are shown)
    if content.get('paragraphs'):
        print("\n--- Paragraphs ---")
        for p in content['paragraphs'][:20]:
            print(p)

    # Print tables, one line per row with cells separated by " | "
    if content.get('tables'):
        print("\n--- Tables ---")
        for i, table in enumerate(content['tables'], start=1):
            print(f"\nTable {i}:")
            for row in table:
                print(" | ".join(str(cell) for cell in row))
|
||||
Reference in New Issue
Block a user