91 lines
2.7 KiB
Python
91 lines
2.7 KiB
Python
import os
|
|
import sys
|
|
import docx
|
|
import pdfplumber
|
|
import json
|
|
|
|
# Switch stdout to UTF-8 before anything is printed; this script prints
# file names taken from a Chinese-named folder.
sys.stdout.reconfigure(encoding="utf-8")

# Folder whose documents are scanned by the extraction loop below.
base_dir = r"D:\医院绩效系统\参考文档"
|
|
|
|
def read_docx(filepath):
    """Extract the text of a .docx file as one newline-separated string.

    Paragraphs come first: each is whitespace-trimmed and blank ones are
    dropped.  Table rows follow, one output line per row, with the trimmed
    cell texts joined by " | "; rows whose cells are all empty are omitted.

    Any exception is caught and reported in the return value as
    "Error reading docx: <message>" rather than being raised.
    """
    try:
        document = docx.Document(filepath)

        # Body paragraphs, skipping those that are empty after trimming.
        trimmed = (paragraph.text.strip() for paragraph in document.paragraphs)
        lines = [line for line in trimmed if line]

        # Tables are appended after all paragraphs, one line per row.
        for table in document.tables:
            for row in table.rows:
                cells = [cell.text.strip() for cell in row.cells]
                if any(cells):
                    lines.append(" | ".join(cells))

        return "\n".join(lines)
    except Exception as e:
        return f"Error reading docx: {e}"
|
|
|
|
def read_doc(filepath):
    """Best-effort read of a .doc file by decoding it as UTF-8 text.

    No Word parser or external converter is involved: the file is simply
    opened in text mode.  This only yields meaningful output for ".doc"
    files that are really plain text; bytes that are not valid UTF-8 are
    silently dropped (errors='ignore').

    Returns the decoded content, or "Error reading doc: <message>" if the
    file cannot be opened or read.
    """
    try:
        # "utf-8-sig" behaves like "utf-8" but also strips a leading
        # byte-order mark, so BOM-prefixed text files do not leak a stray
        # "\ufeff" character into the extracted content.
        with open(filepath, 'r', encoding='utf-8-sig', errors='ignore') as f:
            return f.read()
    except Exception as e:
        return f"Error reading doc: {e}"
|
|
|
|
def read_pdf(filepath):
    """Extract the text of a PDF as one newline-separated string.

    Each page's extracted text is collected in page order; pages for which
    extraction yields nothing (None or an empty string) are skipped.

    Any exception is caught and reported in the return value as
    "Error reading pdf: <message>" rather than being raised.
    """
    try:
        with pdfplumber.open(filepath) as pdf:
            # Consume the pages while the PDF is still open.
            extracted = (page.extract_text() for page in pdf.pages)
            pages = [chunk for chunk in extracted if chunk]
        return "\n".join(pages)
    except Exception as e:
        return f"Error reading pdf: {e}"
|
|
|
|
def get_all_files(directory):
    """Return the sorted names of .docx, .doc and .pdf entries in *directory*.

    Only the directory's immediate entries are examined (no recursion), and
    bare names are returned, not full paths.  The extension check ignores
    case, so names such as "REPORT.DOCX" or "scan.PDF" are included; the
    names themselves are returned with their original spelling.

    Raises OSError (from os.listdir) if *directory* cannot be listed.
    """
    extensions = ('.docx', '.doc', '.pdf')
    # Match on the lower-cased name: upper- or mixed-case extensions would
    # otherwise be skipped without any notice.
    return sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(extensions)
    )
|
|
|
|
# Main extraction: read every document found in base_dir and collect the
# extracted text (or the reader's error string) keyed by file name.
results = {}
for filename in get_all_files(base_dir):
    filepath = os.path.join(base_dir, filename)
    print(f"Reading: {filename}")

    # Dispatch on the lower-cased name so a file with an upper- or
    # mixed-case extension (e.g. "X.PDF") is not dropped by the fallback.
    lowered = filename.lower()
    if lowered.endswith('.docx'):
        content = read_docx(filepath)
    elif lowered.endswith('.doc'):
        content = read_doc(filepath)
    elif lowered.endswith('.pdf'):
        content = read_pdf(filepath)
    else:
        # Not a supported document type; nothing to record.
        continue

    results[filename] = content

# Save results as UTF-8 JSON; ensure_ascii=False keeps the Chinese text
# readable in the output file instead of \uXXXX escapes.
with open(r"D:\医院绩效系统\extracted_content.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"\nExtracted {len(results)} files")
print("Content saved to extracted_content.json")

# Also print a summary of key files: those whose name contains one of the
# markers below.  Only the first 20 matches are listed.
key_files = [f for f in results if '附表' in f or '考核' in f or 'KPI' in f]
print(f"\nKey assessment files found: {len(key_files)}")
for f in key_files[:20]:
    print(f" - {f}")
|