"""Dump readable text from the OLE reference documents numbered 01-13.

The script lists ``base_dir``, keeps the files whose names start with
``01.`` through ``13.``, opens each one as an OLE compound file, and decodes
every stream as UTF-16-LE on a best-effort basis.  The text recovered per
file is written to a JSON file and a preview is echoed to stdout.
"""
import json
import os
import sys

import olefile

# File names and extracted text contain Chinese characters; force UTF-8 so
# printing does not fail on consoles with a legacy code page.
sys.stdout.reconfigure(encoding='utf-8')

base_dir = r"D:\医院绩效系统\参考文档"
OUTPUT_PATH = r"D:\医院绩效系统\ole_extracted.json"

# "01." .. "13." -- str.startswith accepts a tuple, so one call tests them all.
KEY_PREFIXES = tuple(f"{n:02d}." for n in range(1, 14))

MIN_TEXT_LENGTH = 10        # stripped text must be longer than this to be kept
MAX_CHARS_PER_STREAM = 1000  # characters kept from each stream
MAX_STREAMS_PER_FILE = 20    # stream sections stored per file
MAX_STREAMS_LISTED = 10      # stream names echoed per file
PREVIEW_CHARS = 3000         # characters echoed per file in the final summary


def _decode_stream_text(data):
    """Return *data* decoded as UTF-16-LE with control characters removed.

    Undecodable byte sequences are dropped (``errors='ignore'``), so this
    never raises for ``bytes`` input.  Newlines, carriage returns and tabs
    are kept; every other non-printable character is discarded.
    """
    text = data.decode('utf-16-le', errors='ignore')
    return ''.join(c for c in text if c.isprintable() or c in '\n\r\t')


def _collect_text_sections(ole, streams):
    """Return one labelled text section per stream that yields enough text.

    A stream that cannot be read is reported and skipped so that a single
    damaged stream does not discard the rest of the file.
    """
    sections = []
    for stream_path in streams:
        stream_name = '/'.join(stream_path)
        try:
            data = ole.openstream(stream_path).read()
        except Exception as exc:  # best-effort: keep going with other streams
            print(f" Skipped stream {stream_name}: {exc}")
            continue
        clean_text = _decode_stream_text(data)
        if len(clean_text.strip()) > MIN_TEXT_LENGTH:
            sections.append(
                f"=== {stream_name} ===\n{clean_text[:MAX_CHARS_PER_STREAM]}"
            )
    return sections


def _extract_file_text(filepath):
    """Open *filepath* as an OLE file and return its text sections.

    The OLE handle is closed even when listing or reading streams fails.
    Errors from opening the file or listing its streams propagate to the
    caller.
    """
    ole = olefile.OleFileIO(filepath)
    try:
        streams = ole.listdir()
        print(f" Streams found: {len(streams)}")
        for s in streams[:MAX_STREAMS_LISTED]:
            print(f" - {'/'.join(s)}")
        return _collect_text_sections(ole, streams)
    finally:
        ole.close()


# Get actual file names from directory
all_files = os.listdir(base_dir)

# Find files starting with 01-13
key_files = [name for name in sorted(all_files) if name.startswith(KEY_PREFIXES)]

print(f"Found {len(key_files)} key files:")
for f in key_files:
    print(f" - {f}")

results = {}

for filename in key_files:
    filepath = os.path.join(base_dir, filename)
    print(f"\nProcessing: {filename}")

    if not os.path.exists(filepath):
        print(" File not found!")
        results[filename] = "File not found"
        continue

    print(f" File exists, size: {os.path.getsize(filepath)} bytes")

    try:
        text_content = _extract_file_text(filepath)
    except Exception as e:
        # Per-file boundary: record the failure and move on to the next file.
        results[filename] = f"Error: {e}"
        print(f" Error: {e}")
        continue

    if text_content:
        results[filename] = '\n\n'.join(text_content[:MAX_STREAMS_PER_FILE])
        print(f" Extracted text from {len(text_content)} streams")
    else:
        results[filename] = "No text content found"
        print(" No text content found")

# Save results
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print("\n\nSaved to ole_extracted.json")

# Print summary
print("\n\n=== EXTRACTED CONTENT ===\n")
for filename, content in results.items():
    print(f"\n{'='*80}")
    print(f"FILE: {filename}")
    print(f"{'='*80}")
    # Slicing already returns the whole string when it is shorter.
    print(content[:PREVIEW_CHARS])