#!/usr/bin/env python3
"""
AI Processor - Stage 2: AI-powered Text Corrections

SINGLE RESPONSIBILITY: Process text using GLM API for intelligent corrections

Features:
- Split text into chunks for API processing
- Call GLM-4.6 for context-aware corrections
- Track AI-suggested changes
- Handle API errors gracefully
"""

from __future__ import annotations

import os
import re
from typing import List, Optional, Tuple
from dataclasses import dataclass

import httpx


# Matches one "sentence": a run of text ending in a sentence delimiter, or a
# trailing run without one. Both ASCII and full-width '!'/'?' are accepted so
# that Chinese transcripts are split correctly. Concatenating all matches
# reproduces the input exactly.
_SENTENCE_RE = re.compile(r"[^。！？!?\n]*[。！？!?\n]|[^。！？!?\n]+")


@dataclass
class AIChange:
    """Represents an AI-suggested change"""

    chunk_index: int   # 1-based position of the chunk within the processed text
    from_text: str     # Preview of the original chunk (first 50 chars + "...")
    to_text: str       # Preview of the corrected chunk (first 50 chars + "...")
    confidence: float  # 0.0 to 1.0


class AIProcessor:
    """
    Stage 2 Processor: AI-powered corrections using GLM-4.6

    Process:
    1. Split text into chunks (respecting API limits)
    2. Send each chunk to GLM API
    3. Track changes for learning engine
    4. Preserve formatting and structure
    """

    def __init__(self, api_key: str, model: str = "GLM-4.6",
                 base_url: str = "https://open.bigmodel.cn/api/anthropic",
                 fallback_model: str = "GLM-4.5-Air",
                 *,
                 max_chunk_size: int = 6000,
                 max_tokens: int = 8000,
                 temperature: float = 0.3,
                 timeout: float = 60.0):
        """
        Initialize AI processor

        Args:
            api_key: GLM API key
            model: Model name (default: GLM-4.6)
            base_url: API base URL
            fallback_model: Fallback model on primary failure
            max_chunk_size: Maximum characters per chunk sent to the API
            max_tokens: ``max_tokens`` value sent with every request
            temperature: Sampling temperature sent with every request
            timeout: HTTP timeout in seconds for each request

        Raises:
            ValueError: If ``max_chunk_size`` is smaller than 1
        """
        if max_chunk_size < 1:
            raise ValueError("max_chunk_size must be at least 1")

        self.api_key = api_key
        self.model = model
        self.fallback_model = fallback_model
        self.base_url = base_url
        self.max_chunk_size = max_chunk_size  # Characters per chunk
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.timeout = timeout

    def process(self, text: str, context: str = "") -> Tuple[str, List[AIChange]]:
        """
        Process text with AI corrections

        Each chunk is sent to the primary model and, if that fails, to the
        fallback model. If every attempt fails the original chunk is kept, so
        this method never raises because of an API error. Chunks that contain
        only whitespace are passed through without an API call.

        Args:
            text: Text to correct
            context: Optional domain/meeting context

        Returns:
            (corrected_text, list_of_changes)
        """
        chunks = self._split_into_chunks(text)
        corrected_chunks: List[str] = []
        all_changes: List[AIChange] = []

        print(f"📝 Processing {len(chunks)} chunks with {self.model}...")

        for i, chunk in enumerate(chunks, 1):
            print(f" Chunk {i}/{len(chunks)}... ", end="", flush=True)

            # Nothing to correct: keep the chunk verbatim and save a request.
            if not chunk.strip():
                corrected_chunks.append(chunk)
                print("skipped (blank)")
                continue

            corrected_chunk = self._correct_with_fallback(chunk, context)
            if corrected_chunk is None:
                print(" Using original text...")
                corrected_chunks.append(chunk)
                continue

            corrected_chunks.append(corrected_chunk)

            # TODO: Extract actual changes for learning
            # For now, we assume the whole chunk changed. Recorded for both
            # the primary and the fallback model.
            if corrected_chunk != chunk:
                all_changes.append(AIChange(
                    chunk_index=i,
                    from_text=chunk[:50] + "...",
                    to_text=corrected_chunk[:50] + "...",
                    confidence=0.9,  # Placeholder
                ))

        # NOTE: every chunk is re-joined with a blank line, including the
        # pieces of a paragraph that had to be split by sentence.
        return "\n\n".join(corrected_chunks), all_changes

    def _correct_with_fallback(self, chunk: str, context: str) -> Optional[str]:
        """
        Correct one chunk, trying the primary model and then the fallback

        Progress markers are printed for every attempt.

        Returns:
            The corrected text, or None when every attempt failed
        """
        models = [self.model]
        if self.fallback_model and self.fallback_model != self.model:
            models.append(self.fallback_model)

        for attempt, model in enumerate(models):
            if attempt:
                print(f" Retrying with {model}... ", end="", flush=True)
            try:
                corrected = self._process_chunk(chunk, context, model)
            except Exception as exc:  # best-effort boundary: never abort the run
                print(f"✗ {str(exc)[:50]}")
                continue
            print("✓")
            return corrected

        return None

    def _split_into_chunks(self, text: str) -> List[str]:
        """
        Split text into processable chunks

        Strategy:
        - Split by double newlines (paragraphs)
        - Keep chunks under max_chunk_size (joiners included)
        - Don't split mid-paragraph if possible

        Args:
            text: Full text to split

        Returns:
            List of chunks, each at most max_chunk_size characters long.
            Empty input yields an empty list.
        """
        if not text:
            return []

        limit = max(1, self.max_chunk_size)
        separator = '\n\n'

        chunks: List[str] = []
        current_chunk: List[str] = []
        current_length = 0  # exact len(separator.join(current_chunk))

        for para in text.split(separator):
            # If single paragraph exceeds limit, force split
            if len(para) > limit:
                if current_chunk:
                    chunks.append(separator.join(current_chunk))
                    current_chunk = []
                    current_length = 0
                chunks.extend(self._split_long_paragraph(para))
                continue

            # Normal case: accumulate paragraphs, counting the joiner that
            # will sit between the previous paragraph and this one.
            if current_chunk:
                projected = current_length + len(separator) + len(para)
            else:
                projected = len(para)

            if current_chunk and projected > limit:
                chunks.append(separator.join(current_chunk))
                current_chunk = [para]
                current_length = len(para)
            else:
                current_chunk.append(para)
                current_length = projected

        if current_chunk:
            chunks.append(separator.join(current_chunk))

        return chunks

    def _split_long_paragraph(self, para: str) -> List[str]:
        """
        Split an over-long paragraph into pieces of at most max_chunk_size

        Sentences (delimiter kept at the end) are packed greedily. A single
        sentence that is itself longer than the limit is cut at the limit.
        """
        limit = max(1, self.max_chunk_size)
        pieces: List[str] = []
        buffer = ""

        for sentence in _SENTENCE_RE.findall(para):
            if len(buffer) + len(sentence) <= limit:
                buffer += sentence
                continue

            if buffer:
                pieces.append(buffer)

            # No sentence boundary available: hard-cut so the limit holds.
            while len(sentence) > limit:
                pieces.append(sentence[:limit])
                sentence = sentence[limit:]
            buffer = sentence

        if buffer:
            pieces.append(buffer)

        return pieces

    def _process_chunk(self, chunk: str, context: str, model: str) -> str:
        """
        Process a single chunk with GLM API

        Args:
            chunk: Text to correct
            context: Optional domain/meeting context (may be empty)
            model: Model name to send in the request

        Returns:
            The concatenated text blocks of the model reply

        Raises:
            httpx.HTTPError: On transport errors or a non-2xx response
            ValueError: If the reply was cut off at max_tokens or contains
                no text, since using it would drop transcript content
        """
        prompt = self._build_prompt(chunk, context)

        # Tolerate a base URL configured with a trailing slash.
        url = f"{self.base_url.rstrip('/')}/v1/messages"
        headers = {
            "anthropic-version": "2023-06-01",
            "Authorization": f"Bearer {self.api_key}",
            "content-type": "application/json",
        }
        data = {
            "model": model,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
            "messages": [{"role": "user", "content": prompt}],
        }

        with httpx.Client(timeout=self.timeout) as client:
            response = client.post(url, headers=headers, json=data)
            response.raise_for_status()
            result = response.json()

        # A truncated reply would silently replace the chunk with partial text.
        if result.get("stop_reason") == "max_tokens":
            raise ValueError("Model output truncated at max_tokens")

        # The reply is a list of content blocks; only text blocks are wanted
        # and the first block is not guaranteed to be one.
        corrected = "".join(
            block["text"]
            for block in result.get("content") or []
            if isinstance(block, dict)
            and block.get("type", "text") == "text"
            and isinstance(block.get("text"), str)
        )
        if not corrected.strip():
            raise ValueError("API response contained no text content")

        return corrected

    def _build_prompt(self, chunk: str, context: str) -> str:
        """
        Build correction prompt for GLM

        Args:
            chunk: Text to be corrected, embedded verbatim in the prompt
            context: Optional meeting background; its section is omitted
                when this is empty

        Returns:
            The complete prompt string
        """
        base_prompt = """你是专业的会议记录校对专家。请修复以下会议转录中的语音识别错误。

**修复原则**:
1. 严格保留原有格式(时间戳、发言人标识、Markdown标记等)
2. 修复明显的同音字错误
3. 修复专业术语错误
4. 修复语法错误,但保持口语化特征
5. 不确定的地方保持原样,不要过度修改
"""

        if context:
            base_prompt += f"\n**会议背景**:\n{context}\n"

        base_prompt += f"""
**需要修复的内容**:
{chunk}

**请直接输出修复后的文本,不要添加任何解释或标注**:"""

        return base_prompt