# hospital_performance/extract_ole.py

import os
import sys
import olefile
import json

# Switch stdout to UTF-8: the directory path below contains non-ASCII
# characters, and file names are echoed to the console further down.
sys.stdout.reconfigure(encoding='utf-8')

base_dir = r"D:\医院绩效系统\参考文档"

# Numbered prefixes "01." through "13." that mark the files of interest.
KEY_PREFIXES = tuple(f"{n:02d}." for n in range(1, 14))

# Get actual file names from directory
all_files = os.listdir(base_dir)

# Keep, in sorted order, every entry whose name starts with one of the
# numbered prefixes (str.startswith accepts a tuple of candidates).
key_files = [name for name in sorted(all_files) if name.startswith(KEY_PREFIXES)]

print(f"Found {len(key_files)} key files:")
for f in key_files:
    print(f"  - {f}")

# Maps each key file name to either its extracted text or a short status /
# error message; every value is a plain string.
results = {}

for filename in key_files:
    filepath = os.path.join(base_dir, filename)
    print(f"\nProcessing: {filename}")

    if not os.path.exists(filepath):
        print("  File not found!")
        results[filename] = "File not found"
        continue

    print(f"  File exists, size: {os.path.getsize(filepath)} bytes")

    try:
        ole = olefile.OleFileIO(filepath)
        try:
            # List all streams (only the first 10 are echoed to the console)
            streams = ole.listdir()
            print(f"  Streams found: {len(streams)}")
            for s in streams[:10]:
                print(f"    - {'/'.join(s)}")

            # Look for text content in various streams
            text_content = []

            for stream_path in streams:
                stream_name = '/'.join(stream_path)
                try:
                    data = ole.openstream(stream_path).read()
                except Exception as e:
                    # Best effort: one unreadable stream must not discard the
                    # rest of the file, but report it instead of hiding it.
                    print(f"    Skipping unreadable stream {stream_name}: {e}")
                    continue

                # Heuristic: every stream is decoded as UTF-16-LE (common for
                # Word docs). errors='ignore' drops undecodable bytes, so this
                # call does not raise on malformed data.
                text = data.decode('utf-16-le', errors='ignore')
                # Filter out control characters, keeping basic whitespace
                clean_text = ''.join(c for c in text if c.isprintable() or c in '\n\r\t')
                # Ignore streams that yield 10 or fewer meaningful characters
                if len(clean_text.strip()) > 10:
                    # Keep at most the first 1000 characters per stream
                    text_content.append(f"=== {stream_name} ===\n{clean_text[:1000]}")
        finally:
            # Always release the file handle, even if listing or reading failed
            ole.close()

        if text_content:
            # Only the first 20 text-bearing streams are stored per file
            results[filename] = '\n\n'.join(text_content[:20])
            print(f"  Extracted text from {len(text_content)} streams")
        else:
            results[filename] = "No text content found"
            print("  No text content found")

    except Exception as e:
        # Top-level boundary per file: record the failure and move on
        results[filename] = f"Error: {e}"
        print(f"  Error: {e}")

# Persist the per-file outcomes as indented JSON, keeping non-ASCII text readable
output_path = r"D:\医院绩效系统\ole_extracted.json"
with open(output_path, mode="w", encoding="utf-8") as out_file:
    json.dump(results, out_file, indent=2, ensure_ascii=False)

print("\n\nSaved to ole_extracted.json")

# Echo a preview of each file's result, capped at 3000 characters
print("\n\n=== EXTRACTED CONTENT ===\n")
banner = '=' * 80
for name, extracted in results.items():
    print(f"\n{banner}")
    print(f"FILE: {name}")
    print(banner)
    # Slicing is a no-op for strings of 3000 characters or fewer
    print(extracted[:3000])