提交文件

2026-02-28 15:16:15 +08:00
parent 1a4e50e0a4
commit 44f250f58e
159 changed files with 61268 additions and 0 deletions
--- a/extract_ole.py
+++ b/extract_ole.py
@@ -0,0 +1,91 @@
+import os
+import sys
+import olefile
+import json
+
+# Force UTF-8 output on stdout; the paths handled below contain non-ASCII
+# characters.
+sys.stdout.reconfigure(encoding='utf-8')
+
+# Directory that holds the reference documents to scan.
+base_dir = r"D:\医院绩效系统\参考文档"
+
+# Read the actual file names from disk instead of hard-coding them.
+all_files = os.listdir(base_dir)
+
+# Key files are the entries whose names start with a two-digit index,
+# "01." through "13.". str.startswith accepts a tuple of prefixes, so a
+# single call replaces a chain of thirteen separate tests.
+key_prefixes = tuple(f"{n:02d}." for n in range(1, 14))
+key_files = [f for f in sorted(all_files) if f.startswith(key_prefixes)]
+
+print(f"Found {len(key_files)} key files:")
+for f in key_files:
+    print(f"  - {f}")
+
+# Maps each key file name to its extracted text, or to a short status/error
+# message when nothing could be extracted.
+results = {}
+
+for filename in key_files:
+    filepath = os.path.join(base_dir, filename)
+    print(f"\nProcessing: {filename}")
+
+    # The directory listing was taken earlier; skip entries that are gone.
+    if not os.path.exists(filepath):
+        print("  File not found!")
+        results[filename] = "File not found"
+        continue
+
+    print(f"  File exists, size: {os.path.getsize(filepath)} bytes")
+
+    try:
+        ole = olefile.OleFileIO(filepath)
+        try:
+            streams = ole.listdir()
+            print(f"  Streams found: {len(streams)}")
+            # Show only the first ten stream paths to keep the log short.
+            for s in streams[:10]:
+                print(f"    - {'/'.join(s)}")
+
+            # One formatted section per stream that yielded usable text.
+            text_content = []
+            for stream_path in streams:
+                stream_name = '/'.join(stream_path)
+                try:
+                    data = ole.openstream(stream_path).read()
+                except Exception as e:
+                    # Best effort: an unreadable stream is reported and
+                    # skipped so the remaining streams are still processed.
+                    print(f"    Skipped stream {stream_name}: {e}")
+                    continue
+
+                # Heuristic: treat the raw bytes as UTF-16-LE text (common
+                # for Word docs). errors='ignore' drops undecodable bytes,
+                # so this decode cannot raise.
+                text = data.decode('utf-16-le', errors='ignore')
+                # Keep printable characters plus basic whitespace only.
+                clean_text = ''.join(
+                    c for c in text if c.isprintable() or c in '\n\r\t'
+                )
+                # Ignore streams with 10 or fewer meaningful characters;
+                # store at most the first 1000 characters per stream.
+                if len(clean_text.strip()) > 10:
+                    text_content.append(f"=== {stream_name} ===\n{clean_text[:1000]}")
+        finally:
+            # Release the file handle even when listing or reading fails.
+            ole.close()
+
+        if text_content:
+            # Only the first 20 stream sections are kept per file.
+            results[filename] = '\n\n'.join(text_content[:20])
+            print(f"  Extracted text from {len(text_content)} streams")
+        else:
+            results[filename] = "No text content found"
+            print("  No text content found")
+    except Exception as e:
+        # Per-file boundary: record the failure and move on to the next file.
+        results[filename] = f"Error: {e}"
+        print(f"  Error: {e}")
+
+# Persist everything that was collected as indented JSON, keeping non-ASCII
+# characters unescaped.
+output_path = r"D:\医院绩效系统\ole_extracted.json"
+with open(output_path, mode="w", encoding="utf-8") as out_file:
+    json.dump(results, out_file, indent=2, ensure_ascii=False)
+
+print("\n\nSaved to ole_extracted.json")
+
+# Echo a preview of every result, limited to the first 3000 characters each.
+banner = "=" * 80
+print("\n\n=== EXTRACTED CONTENT ===\n")
+for name in results:
+    preview = results[name][:3000]
+    print(f"\n{banner}")
+    print(f"FILE: {name}")
+    print(banner)
+    print(preview)