Add File

2025-11-07 09:05:41 +08:00
parent 45a6742600
commit 8cca42e14c
1 changed files with 345 additions and 0 deletions
--- a/src/summeryanyfile/main.py
+++ b/src/summeryanyfile/main.py
@@ -0,0 +1,345 @@
+"""
+主入口模块 - 命令行界面和主要功能
+"""
+
+import asyncio
+import json
+import sys
+from pathlib import Path
+from typing import Optional
+import click
+from rich.console import Console
+from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeElapsedColumn
+from rich.panel import Panel
+from rich.table import Table
+
+from .config.settings import load_settings, create_default_config, create_env_template
+from .core.models import ProcessingConfig, ChunkStrategy
+from .generators.ppt_generator import PPTOutlineGenerator
+from .utils.logger import setup_logging, get_logger
+from .utils.file_handler import FileHandler
+from .utils.validators import validate_file_path, validate_url, validate_config
+
+# Shared rich console used by every command in this module for user-facing output.
+console = Console()
+# Module-level logger obtained from the project's logging helper.
+logger = get_logger(__name__)
+
+
+@click.group()
+@click.option('--config', '-c', help='配置文件路径')
+@click.option('--log-level', default='INFO', help='日志级别')
+@click.option('--debug', is_flag=True, help='启用调试模式')
+@click.pass_context
+def cli(ctx, config, log_level, debug):
+    """通用文本转PPT大纲生成器"""
+    # NOTE: click renders the docstring above as the group's --help text, so it
+    # is kept verbatim (it reads "Generic text-to-PPT outline generator").
+    #
+    # Root command group: configures logging, loads the settings once and
+    # stores them in ctx.obj['settings'] for the subcommands.
+    ctx.ensure_object(dict)
+
+    # --debug takes precedence over whatever --log-level was given.
+    effective_level = 'DEBUG' if debug else log_level
+    setup_logging(level=effective_level, rich_logging=True)
+
+    # Load the settings and publish them on the click context object.
+    ctx.obj['settings'] = load_settings(config_file=config, debug_mode=debug)
+
+    if not debug:
+        return
+
+    # In debug mode, echo which provider/model the loaded settings select.
+    loaded = ctx.obj['settings']
+    console.print(f"[dim]配置加载完成: {loaded.llm_provider}/{loaded.llm_model}[/dim]")
+
+
+@cli.command()
+@click.argument('input_path')
+@click.option('--output', '-o', help='输出文件路径')
+@click.option('--encoding', help='文件编码')
+@click.option('--max-slides', type=int, help='最大幻灯片数量')
+@click.option('--min-slides', type=int, help='最小幻灯片数量')
+@click.option('--chunk-size', type=int, help='文档块大小')
+@click.option('--chunk-strategy', type=click.Choice(['paragraph', 'semantic', 'recursive', 'hybrid', 'fast']), help='分块策略')
+@click.option('--model', help='LLM模型名称')
+@click.option('--provider', type=click.Choice(['openai', 'anthropic', 'azure']), help='LLM提供商')
+@click.option('--temperature', type=float, help='温度参数 (0.0-2.0)')
+@click.option('--max-tokens', type=int, help='最大token数量')
+@click.option('--base-url', help='自定义OpenAI API端点URL')
+@click.option('--save-markdown', is_flag=True, help='保存转换后的Markdown文件到temp目录')
+@click.option('--temp-dir', help='自定义temp目录路径')
+@click.option('--no-magic-pdf', is_flag=True, help='禁用Magic-PDF，强制使用MarkItDown处理PDF')
+@click.option('--no-progress', is_flag=True, help='禁用进度条')
+@click.pass_context
+def generate(ctx, input_path, output, encoding, max_slides, min_slides, chunk_size, chunk_strategy,
+             model, provider, temperature, max_tokens, base_url, save_markdown, temp_dir, no_magic_pdf, no_progress):
+    """生成PPT大纲"""
+    # NOTE: click renders the docstring above as this command's --help text,
+    # so it is kept verbatim (it reads "Generate a PPT outline").
+    #
+    # Applies the command-line overrides to the settings loaded by the group
+    # callback, validates the result, and runs the async generation pipeline.
+    # Exits with status 1 when validate_config reports errors.
+    settings = ctx.obj['settings']
+
+    # (settings attribute, CLI value, numeric?) in the original assignment
+    # order. Numeric options are compared against None so that an explicit 0
+    # is applied and reaches validate_config instead of being silently
+    # dropped by a truthiness test; text options keep the truthiness test so
+    # an empty string does not override the configured value.
+    overrides = (
+        ('max_slides', max_slides, True),
+        ('min_slides', min_slides, True),
+        ('chunk_size', chunk_size, True),
+        ('chunk_strategy', chunk_strategy, False),
+        ('llm_model', model, False),
+        ('llm_provider', provider, False),
+        ('temperature', temperature, True),
+        ('max_tokens', max_tokens, True),
+        ('openai_base_url', base_url, False),
+    )
+    for attr_name, cli_value, is_numeric in overrides:
+        provided = cli_value is not None if is_numeric else bool(cli_value)
+        if provided:
+            setattr(settings, attr_name, cli_value)
+
+    # Validate the merged configuration before doing any work.
+    config_errors = validate_config(vars(settings))
+    if config_errors:
+        console.print("[red]配置错误:[/red]")
+        for error in config_errors:
+            console.print(f"  • {error}")
+        sys.exit(1)
+
+    # The --no-* flags are negative on the command line; invert them here.
+    asyncio.run(_run_generation(
+        input_path, output, encoding, settings, not no_progress, save_markdown, temp_dir, not no_magic_pdf
+    ))
+
+
+async def _run_generation(
+    input_path: str,
+    output_path: Optional[str],
+    encoding: Optional[str],
+    settings,
+    show_progress: bool,
+    save_markdown: bool = False,
+    temp_dir: Optional[str] = None,
+    use_magic_pdf: bool = True
+):
+    """Generate a PPT outline for one input and write or print it as JSON.
+
+    Args:
+        input_path: A URL or a local file path; checked with ``validate_url``
+            and ``validate_file_path`` in that order.
+        output_path: Destination JSON file. When empty or ``None`` the JSON
+            is printed to the console instead.
+        encoding: Passed through to ``generator.generate_from_file``.
+        settings: Settings object; must provide ``to_processing_config()``.
+        show_progress: Show a rich progress bar when true, otherwise print
+            one line per progress callback.
+        save_markdown: Forwarded to ``PPTOutlineGenerator``.
+        temp_dir: Forwarded to ``PPTOutlineGenerator``.
+        use_magic_pdf: Forwarded to ``PPTOutlineGenerator``.
+
+    Exits the process with status 1 when the input is invalid or when any
+    exception is raised during generation.
+    """
+    file_handler = None  # set only for URL inputs, which produce a temp file
+    local_path = None
+    progress = None      # rich Progress instance, only when show_progress
+
+    try:
+        # Resolve the input to a local path.
+        if validate_url(input_path):
+            console.print(f"[blue]正在处理URL:[/blue] {input_path}")
+            file_handler = FileHandler()
+            local_path = file_handler.handle_input(input_path)[0]
+        elif validate_file_path(input_path):
+            console.print(f"[blue]正在处理文件:[/blue] {input_path}")
+            local_path = input_path
+        else:
+            console.print(f"[red]无效的输入路径:[/red] {input_path}")
+            sys.exit(1)
+
+        # Everything after the input is resolved runs under one try/finally,
+        # so the temp file is removed even if building the generator fails.
+        try:
+            config = settings.to_processing_config()
+            generator = PPTOutlineGenerator(
+                config,
+                save_markdown=save_markdown,
+                temp_dir=temp_dir,
+                use_magic_pdf=use_magic_pdf,
+                cache_dir=None  # use the default cache directory
+            )
+
+            # Tell the user where the converted Markdown will be written.
+            if save_markdown:
+                markdown_dir = temp_dir or generator.document_processor.temp_dir
+                console.print(f"[yellow]Markdown文件将保存到:[/yellow] {markdown_dir}")
+
+            # Tell the user which PDF conversion backend was requested.
+            if use_magic_pdf:
+                console.print("[green]PDF转换:[/green] 使用Magic-PDF (本地高质量)")
+            else:
+                console.print("[yellow]PDF转换:[/yellow] 使用MarkItDown (标准质量)")
+
+            # Progress reporting: a live bar, or plain lines when disabled.
+            if show_progress:
+                progress = Progress(
+                    SpinnerColumn(),
+                    TextColumn("[progress.description]{task.description}"),
+                    BarColumn(),
+                    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                    TimeElapsedColumn(),
+                    console=console
+                )
+                progress.start()
+                progress_task = progress.add_task("初始化...", total=100)
+
+                def progress_callback(step_name: str, percent: float):
+                    progress.update(progress_task, description=step_name, completed=percent)
+            else:
+                def progress_callback(step_name: str, percent: float):
+                    console.print(f"[dim]{step_name} ({percent:.1f}%)[/dim]")
+
+            outline = await generator.generate_from_file(
+                local_path,
+                encoding=encoding,
+                progress_callback=progress_callback
+            )
+
+            # Stop the live display before printing the result.
+            if progress is not None:
+                progress.stop()
+
+            result_json = outline.to_dict()
+
+            if output_path:
+                # Save to file, creating parent directories as needed.
+                output_file = Path(output_path)
+                output_file.parent.mkdir(parents=True, exist_ok=True)
+
+                with open(output_file, 'w', encoding='utf-8') as f:
+                    json.dump(result_json, f, ensure_ascii=False, indent=2)
+
+                console.print(f"[green]PPT大纲已保存到:[/green] {output_file}")
+            else:
+                # Print the JSON verbatim: with markup/emoji processing or
+                # word wrapping enabled, rich could drop "[tag]"-like text,
+                # replace ":name:" codes, or break long strings across lines.
+                console.print(
+                    json.dumps(result_json, ensure_ascii=False, indent=2),
+                    markup=False,
+                    emoji=False,
+                    soft_wrap=True,
+                )
+
+            _show_generation_summary(outline)
+
+        finally:
+            # Test the Progress object, not the task id: the first task id
+            # rich hands out is 0, which is falsy, so a guard on the id never
+            # stops the display on the failure path.
+            if progress is not None:
+                progress.stop()
+
+            # Remove the temp file created for URL inputs.
+            if file_handler is not None:
+                file_handler.cleanup_temp_file(local_path)
+
+    except Exception as e:
+        console.print(f"[red]生成失败:[/red] {e}")
+        logger.error(f"生成失败: {e}", exc_info=True)
+        sys.exit(1)
+
+
+def _show_generation_summary(outline):
+    """Print a two-column summary table for a generated outline.
+
+    Reads ``title``, ``total_pages``, ``page_count_mode`` and ``slides`` from
+    ``outline``; each slide must expose ``slide_type``.
+    """
+    summary = Table(title="PPT大纲摘要")
+    for header, colour in (("属性", "cyan"), ("值", "green")):
+        summary.add_column(header, style=colour)
+
+    # Fixed rows: title, total page count, and page_count_mode (shown under
+    # the label "状态").
+    for label, value in (
+        ("标题", outline.title),
+        ("总页数", str(outline.total_pages)),
+        ("状态", outline.page_count_mode),
+    ):
+        summary.add_row(label, value)
+
+    # Count slides per type, keeping first-seen order.
+    tally = {}
+    for slide in outline.slides:
+        kind = slide.slide_type
+        tally[kind] = tally[kind] + 1 if kind in tally else 1
+
+    distribution = ", ".join(f"{kind}: {count}" for kind, count in tally.items())
+    summary.add_row("幻灯片类型分布", distribution)
+
+    console.print(summary)
+
+
+@cli.command()
+@click.option('--provider', type=click.Choice(['openai', 'anthropic', 'azure']), help='LLM提供商')
+def validate_setup(provider):
+    """验证设置和API连接"""
+    # NOTE: click renders the docstring above as this command's --help text,
+    # so it is kept verbatim (it reads "Validate settings and API connection").
+    #
+    # Checks the configuration, then the provider configuration, then tries
+    # to obtain an LLM instance. Exits with status 1 on the first failing
+    # step so that scripts can detect a broken setup.
+    console.print("[blue]正在验证设置...[/blue]")
+
+    # Reuse the settings the group callback stored on the click context so
+    # that --config / --debug are honoured; fall back to a fresh load only
+    # when no context or no stored settings are available.
+    ctx = click.get_current_context(silent=True)
+    settings = None
+    if ctx is not None and isinstance(ctx.obj, dict):
+        settings = ctx.obj.get('settings')
+    if settings is None:
+        settings = load_settings()
+
+    # Step 1: static configuration checks.
+    config_errors = validate_config(settings.__dict__)
+    if config_errors:
+        console.print("[red]配置错误:[/red]")
+        for error in config_errors:
+            console.print(f"  • {error}")
+        sys.exit(1)
+
+    # Step 2: provider-specific configuration checks.
+    from .core.llm_manager import LLMManager
+
+    llm_manager = LLMManager()
+    test_provider = provider or settings.llm_provider
+
+    if llm_manager.validate_configuration(test_provider, **settings.get_llm_kwargs()):
+        console.print(f"[green]✓[/green] {test_provider} 配置有效")
+    else:
+        console.print(f"[red]✗[/red] {test_provider} 配置无效")
+        sys.exit(1)
+
+    # Step 3: obtain an LLM instance; the instance itself is not needed, only
+    # whether get_llm succeeds.
+    try:
+        llm_manager.get_llm(
+            model=settings.llm_model,
+            provider=test_provider,
+            **settings.get_llm_kwargs()
+        )
+    except Exception as e:
+        console.print(f"[red]✗[/red] LLM连接失败: {e}")
+        sys.exit(1)
+
+    console.print(f"[green]✓[/green] LLM连接成功: {settings.llm_model}")
+
+
+@cli.command()
+def init_config():
+    """初始化配置文件"""
+    # NOTE: click renders the docstring above as this command's --help text,
+    # so it is kept verbatim (it reads "Initialize configuration files").
+    #
+    # Runs both config-file creators, then tells the user to fill in .env.
+    console.print("[blue]正在创建配置文件...[/blue]")
+
+    for create_file in (create_default_config, create_env_template):
+        create_file()
+
+    for message in (
+        "[green]配置文件创建完成![/green]",
+        "请编辑 .env 文件并填入您的API密钥",
+    ):
+        console.print(message)
+
+
+@cli.command()
+@click.argument('input_path')
+def analyze(input_path):
+    """分析文档结构"""
+    # NOTE: click renders the docstring above as this command's --help text,
+    # so it is kept verbatim (it reads "Analyze document structure").
+    #
+    # Loads the document, prints a metadata table and a preview of at most
+    # 500 characters. On failure the error is printed and logged with its
+    # traceback, and the process exits with status 1.
+    console.print(f"[blue]正在分析文档:[/blue] {input_path}")
+
+    try:
+        from .core.document_processor import DocumentProcessor
+
+        processor = DocumentProcessor()
+        doc_info = processor.load_document(input_path)
+
+        # Document metadata table.
+        info_table = Table(title="文档信息")
+        info_table.add_column("属性", style="cyan")
+        info_table.add_column("值", style="green")
+
+        info_table.add_row("标题", doc_info.title)
+        info_table.add_row("文件类型", doc_info.file_type)
+        info_table.add_row("编码", doc_info.encoding)
+        info_table.add_row("大小", f"{doc_info.size:,} 字节")
+        info_table.add_row("内容长度", f"{len(doc_info.content):,} 字符")
+
+        console.print(info_table)
+
+        # Content preview: first 500 characters, with an ellipsis when cut.
+        content = doc_info.content
+        preview = content[:500] + "..." if len(content) > 500 else content
+        console.print(Panel(preview, title="内容预览"))
+
+    except Exception as e:
+        console.print(f"[red]分析失败:[/red] {e}")
+        # Keep the traceback in the log and signal failure to the caller,
+        # matching how the generate command reports errors.
+        logger.error(f"分析失败: {e}", exc_info=True)
+        sys.exit(1)
+
+
+@cli.command()
+def list_models():
+    """列出支持的模型"""
+    # NOTE: click renders the docstring above as this command's --help text,
+    # so it is kept verbatim (it reads "List supported models").
+    #
+    # Prints one single-column table of model names per supported provider.
+    from .core.llm_manager import LLMManager
+
+    manager = LLMManager()
+
+    for provider_name in manager.SUPPORTED_PROVIDERS:
+        model_names = manager.list_available_models(provider_name)
+
+        model_table = Table(title=f"{provider_name.upper()} 模型")
+        model_table.add_column("模型名称", style="cyan")
+        for model_name in model_names:
+            model_table.add_row(model_name)
+
+        console.print(model_table)
+
+
+# Run the click command group when this module is executed as a script.
+if __name__ == '__main__':
+    cli()