提交文件

2026-02-28 15:16:15 +08:00
parent 1a4e50e0a4
commit 44f250f58e
159 changed files with 61268 additions and 0 deletions
--- a/read_docs.py
+++ b/read_docs.py
@@ -0,0 +1,90 @@
+import os
+import sys
+import docx
+import pdfplumber
+import json
+
+# Switch stdout to UTF-8 before anything is printed; the script prints file
+# names, which may contain non-ASCII characters.
+sys.stdout.reconfigure(encoding="utf-8")
+
+# Directory that is scanned for the documents to extract.
+base_dir = r"D:\医院绩效系统\参考文档"
+
+def read_docx(filepath):
+    """Extract the text of a .docx file as one newline-joined string.
+
+    Non-blank paragraphs come first (each stripped of surrounding
+    whitespace), followed by one line per table row in which the stripped
+    cell texts are joined with " | ". Rows whose cells are all empty are
+    skipped.
+
+    Any exception is caught and reported as an "Error reading docx: ..."
+    string instead of being raised, so the caller always receives a string.
+    """
+    try:
+        document = docx.Document(filepath)
+
+        # Body paragraphs, dropping whitespace-only ones.
+        stripped_paragraphs = (p.text.strip() for p in document.paragraphs)
+        lines = [s for s in stripped_paragraphs if s]
+
+        # Table rows are appended after all paragraphs.
+        for tbl in document.tables:
+            for row in tbl.rows:
+                cells = [cell.text.strip() for cell in row.cells]
+                if any(cells):
+                    lines.append(" | ".join(cells))
+
+        return "\n".join(lines)
+    except Exception as e:
+        return f"Error reading docx: {e}"
+
+def read_doc(filepath):
+    """Read a .doc file as plain text and return its contents.
+
+    This does NOT parse the binary Word format; it only helps for ".doc"
+    files that are really text underneath. Decoding is attempted in order:
+
+    1. strict UTF-8,
+    2. strict GB18030 (a superset of GBK/GB2312), so Chinese text saved in a
+       legacy code page is kept instead of being silently dropped,
+    3. UTF-8 with undecodable bytes ignored - the lossy last resort.
+
+    Any other failure (missing file, permission problem, ...) is reported
+    as an "Error reading doc: ..." string instead of being raised.
+    """
+    # Strict attempts first: a wrong guess raises UnicodeDecodeError rather
+    # than quietly discarding bytes.
+    for encoding in ('utf-8', 'gb18030'):
+        try:
+            with open(filepath, 'r', encoding=encoding) as f:
+                return f.read()
+        except UnicodeDecodeError:
+            continue
+        except Exception as e:
+            return f"Error reading doc: {e}"
+
+    # Neither encoding fits (e.g. binary content): keep whatever decodes.
+    try:
+        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+            return f.read()
+    except Exception as e:
+        return f"Error reading doc: {e}"
+
+def read_pdf(filepath):
+    """Extract the text of a PDF file, page by page.
+
+    The text of each page is joined with newlines; pages for which
+    extract_text() yields nothing (None or an empty string) are left out.
+
+    Any exception is caught and reported as an "Error reading pdf: ..."
+    string instead of being raised.
+    """
+    try:
+        with pdfplumber.open(filepath) as pdf:
+            extracted = (page.extract_text() for page in pdf.pages)
+            # Consume the generator while the PDF is still open.
+            chunks = [page_text for page_text in extracted if page_text]
+        return "\n".join(chunks)
+    except Exception as e:
+        return f"Error reading pdf: {e}"
+
+def get_all_files(directory):
+    """Return the sorted names of the Word/PDF documents in *directory*.
+
+    A name is kept when it ends in .docx, .doc or .pdf. The comparison is
+    case-insensitive so that names such as "REPORT.PDF" are not overlooked.
+    Only the given directory is listed (no recursion); names are returned
+    without the directory part and in their original spelling.
+    """
+    suffixes = ('.docx', '.doc', '.pdf')
+    return sorted(f for f in os.listdir(directory) if f.lower().endswith(suffixes))
+
+# Main extraction: map each document name to its extracted text.
+# The readers return an "Error reading ..." string on failure, and that
+# string is stored like any other content.
+results = {}
+for filename in get_all_files(base_dir):
+    filepath = os.path.join(base_dir, filename)
+    print(f"Reading: {filename}")
+
+    # Dispatch on the lower-cased name so it agrees with get_all_files().
+    lowered = filename.lower()
+    if lowered.endswith('.docx'):
+        content = read_docx(filepath)
+    elif lowered.endswith('.doc'):
+        content = read_doc(filepath)
+    elif lowered.endswith('.pdf'):
+        content = read_pdf(filepath)
+    else:
+        continue
+
+    results[filename] = content
+
+# Save results as UTF-8 JSON; ensure_ascii=False keeps non-ASCII text readable.
+with open(r"D:\医院绩效系统\extracted_content.json", "w", encoding="utf-8") as f:
+    json.dump(results, f, ensure_ascii=False, indent=2)
+
+print(f"\nExtracted {len(results)} files")
+print("Content saved to extracted_content.json")
+
+# Also print a summary of key files: names containing one of the keywords
+# below, limited to the first 20 matches.
+key_files = [f for f in results if '附表' in f or '考核' in f or 'KPI' in f]
+print(f"\nKey assessment files found: {len(key_files)}")
+for f in key_files[:20]:
+    print(f"  - {f}")