提交文件
This commit is contained in:
111
read_key_docs.py
Normal file
111
read_key_docs.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import os
|
||||
import sys
|
||||
import docx
|
||||
import pdfplumber
|
||||
import json
|
||||
|
||||
# Switch stdout to UTF-8 before anything is printed.
# NOTE(review): presumably needed so the Chinese file names print on a
# console whose default encoding is not UTF-8 - confirm.
sys.stdout.reconfigure(encoding="utf-8")

# Directory holding the reference documents to scan.
base_dir = r"D:\医院绩效系统\参考文档"

# Snapshot of every entry name in base_dir (names only, not full paths).
all_files = os.listdir(base_dir)
print(f"Total files found: {len(all_files)}")
|
||||
|
||||
# Find files by pattern matching
def find_files(patterns, files=None):
    """Return the file names that contain at least one of *patterns*.

    Matching is a plain, case-sensitive substring test. Each name appears at
    most once in the result, and the result keeps the order of the input
    names.

    Args:
        patterns: Iterable of substrings to look for.
        files: Iterable of names to search. When omitted (``None``), the
            module-level ``all_files`` listing is searched, which is the
            original behaviour.

    Returns:
        A new list with the matching names.
    """
    # Resolve the default lazily so an explicit ``files`` argument never
    # touches the module global.
    candidates = all_files if files is None else files
    return [
        name
        for name in candidates
        if any(pattern in name for pattern in patterns)
    ]
|
||||
|
||||
# Substrings that mark a file name as one of the key assessment documents.
key_patterns = [
    # Numbered appendices ("附表一" .. "附表十三").
    "附表一", "附表二", "附表三", "附表四", "附表五", "附表六",
    "附表七", "附表八", "附表九", "附表十", "附表十一", "附表十二", "附表十三",
    # Topic and department keywords.
    "一票否决", "职能科室公共", "护理部", "院感", "医保", "药学",
    "手术临床", "非手术", "医疗技术", "医疗辅助", "行政科室",
    "职工绩效", "KPI",
]

# Names from the directory listing that match any pattern above.
key_files = find_files(key_patterns)

print(f"\nKey assessment files found: {len(key_files)}")
for matched_name in sorted(key_files):
    print(f" - {matched_name}")
|
||||
|
||||
def read_docx(filepath):
    """Extract the text of a .docx file as one newline-joined string.

    All non-blank paragraphs come first (stripped), followed by one line per
    table row. A row's cell texts are stripped and joined with " | "; rows
    whose cells are all empty are dropped.

    Any exception is caught and reported as the string ``"Error: <message>"``
    instead of being raised, so the caller always receives a string.
    """
    try:
        document = docx.Document(filepath)

        stripped_paragraphs = (p.text.strip() for p in document.paragraphs)
        lines = [chunk for chunk in stripped_paragraphs if chunk]

        for tbl in document.tables:
            for row in tbl.rows:
                cells = [cell.text.strip() for cell in row.cells]
                if any(cells):
                    lines.append(" | ".join(cells))

        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
|
||||
|
||||
def read_doc(filepath):
    """Return the bytes of *filepath* decoded as text, on a best-effort basis.

    The file is read in binary mode and decoded with the first of UTF-8, GBK
    and GB2312 that succeeds. If none does, Latin-1 is used; it maps every
    byte value, so decoding always yields a string.

    NOTE(review): the bytes are decoded as-is and no Word binary parsing is
    done, so a real legacy .doc presumably comes out largely unreadable -
    confirm whether a proper converter is needed.

    Args:
        filepath: Path of the file to read.

    Returns:
        The decoded text, or ``"Error: <message>"`` if the file cannot be
        read. Nothing is raised for read failures.
    """
    try:
        with open(filepath, 'rb') as f:
            raw = f.read()
    except Exception as e:
        return f"Error: {e}"

    # Try different encodings, strictest first. Only a decode failure moves
    # on to the next candidate; a bare ``except`` would also swallow
    # KeyboardInterrupt / SystemExit.
    for enc in ('utf-8', 'gbk', 'gb2312'):
        try:
            return raw.decode(enc)
        except UnicodeDecodeError:
            continue

    # Latin-1 cannot fail, so it is the explicit last resort.
    return raw.decode('latin-1')
|
||||
|
||||
def read_pdf(filepath):
    """Extract the text of a PDF as one newline-joined string.

    Each page's text is taken with ``page.extract_text()``; pages that yield
    nothing (``None`` or an empty string) are left out.

    Any exception is caught and reported as the string ``"Error: <message>"``
    instead of being raised, so the caller always receives a string.
    """
    try:
        with pdfplumber.open(filepath) as pdf:
            extracted = (page.extract_text() for page in pdf.pages)
            # Consume the generator while the PDF is still open.
            page_texts = [chunk for chunk in extracted if chunk]
        return "\n".join(page_texts)
    except Exception as e:
        return f"Error: {e}"
|
||||
|
||||
# Read and save key files
# Reader for each supported extension. Suffixes are compared against the
# lower-cased file name so that "X.DOCX" / "X.PDF" are not silently skipped.
_READERS = (
    ('.docx', read_docx),
    ('.doc', read_doc),
    ('.pdf', read_pdf),
)

# Maps file name -> extracted text (or the reader's "Error: ..." string).
results = {}

# Limit to first 20 (by sorted name). The cap is applied before the
# extension filter, so fewer than 20 files may actually be read.
for filename in sorted(key_files)[:20]:
    filepath = os.path.join(base_dir, filename)
    print(f"\nReading: {filename}")

    lowered = filename.lower()
    reader = next(
        (func for suffix, func in _READERS if lowered.endswith(suffix)),
        None,
    )
    if reader is None:
        # Unsupported file type: nothing is stored for it.
        continue

    content = reader(filepath)
    results[filename] = content
    print(f"Content length: {len(content)} chars")

# Save results
with open(r"D:\医院绩效系统\key_content.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps the Chinese text readable in the JSON file.
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"\n\nSaved {len(results)} files to key_content.json")

# Print content
for filename, content in results.items():
    print(f"\n{'='*80}")
    print(f"FILE: {filename}")
    print(f"{'='*80}")
    # Slicing already handles strings shorter than the limit.
    preview = content[:4000]
    print(preview)
|
||||
Reference in New Issue
Block a user