提交文件
This commit is contained in:
91
extract_ole.py
Normal file
91
extract_ole.py
Normal file
@@ -0,0 +1,91 @@
|
||||
import os
|
||||
import sys
|
||||
import olefile
|
||||
import json
|
||||
|
||||
# Switch stdout to UTF-8 so the non-ASCII file names and extracted text below
# are encoded as UTF-8 regardless of the console's default encoding.
sys.stdout.reconfigure(encoding='utf-8')

# Directory holding the reference documents to scan.
base_dir = r"D:\医院绩效系统\参考文档"

# Names beginning with "01." through "13." are the documents of interest.
KEY_FILE_PREFIXES = tuple(f"{n:02d}." for n in range(1, 14))

# Get actual file names from directory
all_files = os.listdir(base_dir)

# str.startswith accepts a tuple, so a single call tests all thirteen
# prefixes; sorting first keeps the report and the results in name order.
key_files = [name for name in sorted(all_files) if name.startswith(KEY_FILE_PREFIXES)]

print(f"Found {len(key_files)} key files:")
for f in key_files:
    print(f" - {f}")
|
||||
|
||||
# Caps that keep the console output and the saved JSON to a readable size.
MAX_LISTED_STREAMS = 10       # stream names echoed to the console per file
MAX_CHARS_PER_STREAM = 1000   # decoded characters kept per stream
MAX_STREAMS_PER_FILE = 20     # decoded streams kept per file
MIN_TEXT_LENGTH = 10          # stripped text must be longer than this to count


def _decode_stream_text(data):
    """Decode raw stream bytes as UTF-16-LE and drop non-printable characters.

    Undecodable byte sequences are ignored rather than raised. Newline,
    carriage return and tab are kept alongside printable characters.
    """
    text = data.decode('utf-16-le', errors='ignore')
    return ''.join(c for c in text if c.isprintable() or c in '\n\r\t')


def _extract_text_sections(ole, streams):
    """Return one "=== name ===" section for each stream that yields text.

    ``ole`` is an open ``olefile.OleFileIO`` and ``streams`` the paths returned
    by its ``listdir()``. A stream that cannot be read is reported and skipped
    so that one bad stream does not discard the rest of the file.
    """
    sections = []
    for stream_path in streams:
        stream_name = '/'.join(stream_path)
        try:
            data = ole.openstream(stream_path).read()
        except Exception as exc:
            # Best-effort extraction: make the failure visible, then move on.
            print(f" Skipped unreadable stream {stream_name}: {exc}")
            continue

        clean_text = _decode_stream_text(data)
        if len(clean_text.strip()) > MIN_TEXT_LENGTH:
            sections.append(
                f"=== {stream_name} ===\n{clean_text[:MAX_CHARS_PER_STREAM]}"
            )
    return sections


# Maps each key file name to its extracted text or to a status/error message.
results = {}

for filename in key_files:
    filepath = os.path.join(base_dir, filename)
    print(f"\nProcessing: {filename}")

    if not os.path.exists(filepath):
        print(" File not found!")
        results[filename] = "File not found"
        continue

    print(f" File exists, size: {os.path.getsize(filepath)} bytes")

    try:
        ole = olefile.OleFileIO(filepath)
        try:
            # List all streams
            streams = ole.listdir()
            print(f" Streams found: {len(streams)}")
            for s in streams[:MAX_LISTED_STREAMS]:
                print(f" - {'/'.join(s)}")

            # Look for text content in various streams
            text_content = _extract_text_sections(ole, streams)
        finally:
            # Release the file handle even when listing or reading fails.
            ole.close()
    except Exception as e:
        # Per-file boundary: record the failure and continue with the next file.
        results[filename] = f"Error: {e}"
        print(f" Error: {e}")
    else:
        if text_content:
            results[filename] = '\n\n'.join(text_content[:MAX_STREAMS_PER_FILE])
            print(f" Extracted text from {len(text_content)} streams")
        else:
            results[filename] = "No text content found"
            print(" No text content found")
|
||||
|
||||
# Persist the per-file results as indented JSON, keeping non-ASCII characters
# as-is instead of \u escapes.
serialized_results = json.dumps(results, ensure_ascii=False, indent=2)
with open(r"D:\医院绩效系统\ole_extracted.json", "w", encoding="utf-8") as out_file:
    out_file.write(serialized_results)

print("\n\nSaved to ole_extracted.json")
|
||||
|
||||
# Echo a preview of every file's result beneath an 80-character banner.
banner = '=' * 80
print("\n\n=== EXTRACTED CONTENT ===\n")
for name, extracted in results.items():
    print(f"\n{banner}")
    print(f"FILE: {name}")
    print(banner)
    # A slice returns the whole string when it is shorter than the cap.
    print(extracted[:3000])
|
||||
Reference in New Issue
Block a user