# hospital_performance/extract_all_docs.py

#!/usr/bin/env python3
"""
Extract the contents of all PPT, PDF, and XLS documents.
"""
import os
import json
import sys

# Try to import each optional library; record its availability in a HAS_* flag.
try:
    from pptx import Presentation
    HAS_PPTX = True
except ImportError:
    HAS_PPTX = False
    print("Warning: python-pptx not available for .pptx files")

try:
    import pdfplumber
    HAS_PDFPLUMBER = True
except ImportError:
    HAS_PDFPLUMBER = False
    print("Warning: pdfplumber not available")

try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    HAS_PANDAS = False
    print("Warning: pandas not available")

try:
    import openpyxl
    HAS_OPENPYXL = True
except ImportError:
    HAS_OPENPYXL = False
    print("Warning: openpyxl not available")

try:
    import xlrd
    HAS_XLRD = True
except ImportError:
    HAS_XLRD = False
    print("Warning: xlrd not available")


def extract_ppt_content(filepath):
    """Extract the text and tables of every slide in a PowerPoint file.

    Only the ``.pptx`` format is read (through python-pptx). The extension
    check is case-insensitive.

    Args:
        filepath: Path of the presentation file.

    Returns:
        On success, a list with one dict per slide containing
        ``'slide_number'`` (counted from 1), ``'text'`` (the stripped,
        non-empty text of each shape) and ``'tables'`` (each table as a list
        of rows, each row a list of cell texts).
        On failure, a dict ``{'error': message}``: the file is not ``.pptx``,
        python-pptx is not installed, or reading the file raised.
    """
    # Everything runs inside the try so that any failure is reported in the
    # returned dict rather than raised to the caller.
    try:
        if not filepath.lower().endswith('.pptx'):
            # Legacy binary .ppt (or any other extension) cannot be parsed here.
            return {'error': f'无法处理旧格式PPT文件: {filepath}'}
        if not HAS_PPTX:
            # Report the real cause instead of blaming the file format.
            return {'error': 'python-pptx not available'}

        content = []
        prs = Presentation(filepath)
        for slide_num, slide in enumerate(prs.slides, 1):
            slide_content = {
                'slide_number': slide_num,
                'text': [],
                'tables': []
            }
            for shape in slide.shapes:
                if hasattr(shape, "text") and shape.text.strip():
                    slide_content['text'].append(shape.text.strip())
                if shape.has_table:
                    slide_content['tables'].append(
                        [[cell.text for cell in row.cells]
                         for row in shape.table.rows]
                    )
            content.append(slide_content)
        return content
    except Exception as e:
        return {'error': str(e)}


def extract_pdf_content(filepath):
    """Read the text and tables of each page of a PDF through pdfplumber.

    Args:
        filepath: Path of the PDF file.

    Returns:
        A list with one dict per page: ``'page_number'`` (counted from 1),
        ``'text'`` (``''`` when the page yields no text) and ``'tables'``
        (``[]`` when the page yields no tables). If pdfplumber is not
        available, or anything raises while reading, a dict
        ``{'error': message}`` is returned instead.
    """
    try:
        if not HAS_PDFPLUMBER:
            return {'error': 'pdfplumber not available'}
        with pdfplumber.open(filepath) as document:
            return [
                {
                    'page_number': number,
                    'text': page.extract_text() or '',
                    'tables': page.extract_tables() or [],
                }
                for number, page in enumerate(document.pages, start=1)
            ]
    except Exception as exc:
        return {'error': str(exc)}


def _json_safe(value):
    """Return *value* in a form that ``json.dump`` can encode."""
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    isoformat = getattr(value, 'isoformat', None)
    if callable(isoformat):
        # Date/time objects (including pandas timestamps) expose isoformat().
        return isoformat()
    return str(value)


def extract_xls_content(filepath):
    """Extract every sheet of an Excel workbook as JSON-friendly data.

    ``.xlsx`` files (extension matched case-insensitively) are opened with
    the openpyxl engine; any other file is opened with the xlrd engine.

    Args:
        filepath: Path of the workbook.

    Returns:
        A dict mapping each sheet name to ``{'columns': [...], 'data': [...]}``
        where ``'data'`` is a list of rows and missing cells are ``''``.
        Column labels and cell values that JSON cannot encode directly
        (for example dates) are converted with ``isoformat()`` when they
        have it, otherwise with ``str()``.
        If pandas is not available, or anything raises while reading, a dict
        ``{'error': message}`` is returned instead.
    """
    # Everything runs inside the try so that any failure is reported in the
    # returned dict rather than raised to the caller.
    try:
        if not HAS_PANDAS:
            return {'error': 'pandas not available'}

        engine = 'openpyxl' if filepath.lower().endswith('.xlsx') else 'xlrd'
        content = {}
        # The context manager closes the workbook handle even when a sheet
        # fails to parse.
        with pd.ExcelFile(filepath, engine=engine) as xl_file:
            for sheet_name in xl_file.sheet_names:
                df = pd.read_excel(xl_file, sheet_name=sheet_name)
                rows = df.fillna('').values.tolist()
                content[sheet_name] = {
                    'columns': [_json_safe(col) for col in df.columns.tolist()],
                    'data': [[_json_safe(cell) for cell in row] for row in rows],
                }
        return content
    except Exception as e:
        return {'error': str(e)}


def process_directory(base_dir, output_file):
    """Extract every supported document in the reference folder to JSON.

    Scans the ``参考文档`` sub-directory of *base_dir* (regular files only, in
    sorted name order), dispatches each file to the matching extractor by
    extension, and writes the collected results to *output_file* as UTF-8
    JSON. Extensions are matched case-insensitively, so ``REPORT.PDF`` is
    handled like ``report.pdf``. Files with other extensions are ignored.

    Args:
        base_dir: Directory that contains the ``参考文档`` folder.
        output_file: Path of the JSON file to write.

    Returns:
        A dict with the keys ``'ppt_files'``, ``'pdf_files'`` and
        ``'xls_files'``, each mapping a file name to whatever the
        corresponding extractor returned for it.

    Raises:
        OSError: If the reference folder cannot be listed or the output file
            cannot be written.
    """
    results = {
        'ppt_files': {},
        'pdf_files': {},
        'xls_files': {}
    }

    ref_dir = os.path.join(base_dir, '参考文档')

    for filename in sorted(os.listdir(ref_dir)):
        filepath = os.path.join(ref_dir, filename)

        if not os.path.isfile(filepath):
            continue

        print(f"处理文件: {filename}")

        # Compare on a lower-cased name: upper-case extensions are common
        # and would otherwise be skipped without any notice.
        lowered = filename.lower()

        try:
            if lowered.endswith(('.ppt', '.pptx')):
                print("  -> 提取PPT内容...")
                results['ppt_files'][filename] = extract_ppt_content(filepath)

            elif lowered.endswith('.pdf'):
                print("  -> 提取PDF内容...")
                results['pdf_files'][filename] = extract_pdf_content(filepath)

            elif lowered.endswith(('.xls', '.xlsx')):
                print("  -> 提取XLS内容...")
                results['xls_files'][filename] = extract_xls_content(filepath)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"  -> 错误: {e}")
            continue

    # Persist the collected results.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\n所有内容已保存到: {output_file}")
    return results


def main(base_dir=None):
    """Run the extraction for one project directory and print a summary.

    The JSON output is written to ``all_docs_content.json`` inside the
    project directory.

    Args:
        base_dir: Project directory passed on to ``process_directory``.
            When omitted, the original built-in location is used, so
            calling ``main()`` behaves as before.
    """
    if base_dir is None:
        base_dir = r'd:\医院绩效系统'
    output_file = os.path.join(base_dir, 'all_docs_content.json')

    print("=" * 60)
    print("开始提取所有文档内容")
    print("=" * 60)

    results = process_directory(base_dir, output_file)

    # Print per-category counts.
    print("\n" + "=" * 60)
    print("提取完成统计:")
    print(f"  PPT文件: {len(results['ppt_files'])} 个")
    print(f"  PDF文件: {len(results['pdf_files'])} 个")
    print(f"  XLS文件: {len(results['xls_files'])} 个")
    print("=" * 60)


if __name__ == '__main__':
    # An optional first command-line argument overrides the default directory.
    main(sys.argv[1] if len(sys.argv) > 1 else None)