#!/usr/bin/env python3
"""
Multi-language Documentation Support

Provides language detection, multi-language structure handling,
and translation-ready format generation.
"""

import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from dataclasses import dataclass
from collections import Counter
import json


@dataclass
class LanguageInfo:
    """Language information for a document."""
    code: str  # ISO 639-1 code (e.g., 'en', 'es', 'zh')
    name: str  # Full name (e.g., 'English', 'Spanish', 'Chinese')
    confidence: float  # Detection confidence (0.0-1.0)
    script: Optional[str] = None  # Script type (e.g., 'Latin', 'Cyrillic')


@dataclass
class TranslationStatus:
    """Translation status for a document."""
    source_language: str
    target_languages: List[str]
    translated_languages: Set[str]
    missing_languages: Set[str]
    completeness: float  # Percentage (0.0-1.0)


class LanguageDetector:
    """
    Detect document language using heuristics.

    Uses character patterns, common words, and script detection.
    """

    # Common word patterns by language.  The word-list patterns are written
    # in lowercase because detect() lowercases its sample before matching;
    # the bracketed ranges match whole scripts (Han, Kana, Hangul, ...).
    LANGUAGE_PATTERNS = {
        'en': [
            r'\b(the|and|is|are|in|to|of|for|with|on|at|by|from)\b',
            r'\b(this|that|these|those|what|which|who|where|when)\b',
        ],
        'es': [
            r'\b(el|la|los|las|de|en|y|a|es|por|para|con|su)\b',
            r'\b(que|no|un|una|como|más|pero|muy|todo|ya)\b',
        ],
        'fr': [
            r'\b(le|la|les|de|et|en|un|une|pour|dans|que|sur|avec)\b',
            r'\b(est|sont|ce|qui|plus|ne|pas|nous|vous|tout)\b',
        ],
        'de': [
            r'\b(der|die|das|und|in|zu|den|von|ist|mit|für|auf)\b',
            r'\b(ein|eine|nicht|sich|auch|werden|an|als|ich|sie)\b',
        ],
        'zh': [
            r'[\u4e00-\u9fff]',  # Chinese characters
            r'(的|了|和|是|在|有|我|他|不|这)',
        ],
        'ja': [
            r'[\u3040-\u309f]',  # Hiragana
            r'[\u30a0-\u30ff]',  # Katakana
            r'[\u4e00-\u9faf]',  # Kanji
        ],
        'ko': [
            r'[\uac00-\ud7af]',  # Hangul
            r'(의|가|이|은|들|는|좀|잘|께|을)',
        ],
        'ru': [
            r'[\u0400-\u04ff]',  # Cyrillic
            r'\b(и|в|не|на|с|что|он|по|а|как|это|все)\b',
        ],
        'pt': [
            r'\b(o|a|de|e|do|da|em|um|para|é|com|não|os|as)\b',
            r'\b(que|se|mais|por|dos|das|como|mas|uma|ou)\b',
        ],
        'it': [
            r'\b(il|la|di|e|a|da|in|che|per|un|una|non|del)\b',
            r'\b(con|alla|della|al|nel|sono|come|più|ma|dei)\b',
        ],
        'ar': [
            r'[\u0600-\u06ff]',  # Arabic
            r'(في|من|على|إلى|هذا|ما|أن|كان|هو|التي)',
        ],
    }

    # Human-readable language names, keyed by ISO 639-1 code.
    LANGUAGE_NAMES = {
        'en': 'English',
        'es': 'Spanish',
        'fr': 'French',
        'de': 'German',
        'zh': 'Chinese',
        'ja': 'Japanese',
        'ko': 'Korean',
        'ru': 'Russian',
        'pt': 'Portuguese',
        'it': 'Italian',
        'ar': 'Arabic',
    }

    # Script types, keyed by ISO 639-1 code.
    SCRIPTS = {
        'en': 'Latin', 'es': 'Latin', 'fr': 'Latin', 'de': 'Latin',
        'pt': 'Latin', 'it': 'Latin',
        'zh': 'Han', 'ja': 'Japanese', 'ko': 'Hangul',
        'ru': 'Cyrillic', 'ar': 'Arabic',
    }

    def detect(self, text: str, sample_size: int = 2000) -> LanguageInfo:
        """
        Detect language of text.

        Args:
            text: Text to analyze
            sample_size: Number of characters to sample

        Returns:
            LanguageInfo with detected language
        """
        if not text.strip():
            # Nothing to analyze: fall back to English with zero confidence.
            return LanguageInfo('en', 'English', 0.0)

        # Sample a prefix for efficiency and lowercase it once; the word
        # patterns are lowercase, so re.IGNORECASE is not needed.
        sample = text[:sample_size].lower()

        # Score each language by counting pattern hits in the sample.
        scores = {}
        for lang_code, patterns in self.LANGUAGE_PATTERNS.items():
            scores[lang_code] = sum(
                len(re.findall(pattern, sample)) for pattern in patterns
            )

        if not scores or max(scores.values()) == 0:
            # No pattern matched at all: default to English, low confidence.
            return LanguageInfo('en', 'English', 0.1)

        # Confidence is the winner's share of all pattern hits.
        best_lang = max(scores, key=scores.get)
        total_score = sum(scores.values())
        confidence = scores[best_lang] / total_score if total_score > 0 else 0.0

        return LanguageInfo(
            code=best_lang,
            name=self.LANGUAGE_NAMES.get(best_lang, best_lang.upper()),
            confidence=min(confidence, 1.0),
            script=self.SCRIPTS.get(best_lang),
        )

    def detect_from_filename(self, filename: str) -> Optional[str]:
        """
        Detect language from filename pattern.

        Supports patterns like:
        - file.en.md
        - file_en.md
        - file-en.md
        - en/file.md

        Args:
            filename: Filename to analyze

        Returns:
            ISO 639-1 language code or None
        """
        # Pattern: file.en.md
        match = re.search(r'\.([a-z]{2})\.md$', filename)
        if match and match.group(1) in self.LANGUAGE_NAMES:
            return match.group(1)

        # Pattern: file_en.md or file-en.md
        match = re.search(r'[_-]([a-z]{2})\.md$', filename)
        if match and match.group(1) in self.LANGUAGE_NAMES:
            return match.group(1)

        # Pattern: en/file.md (language as a path component).  The docstring
        # promised this form but no pattern previously implemented it.
        # Accepts both '/' and '\' separators.
        match = re.search(r'(?:^|[/\\])([a-z]{2})[/\\][^/\\]*\.md$', filename)
        if match and match.group(1) in self.LANGUAGE_NAMES:
            return match.group(1)

        return None
class MultiLanguageManager:
    """
    Manages multi-language documentation structure.

    Organizes documents by language and tracks translations.
    """

    def __init__(self):
        """Initialize multi-language manager."""
        self.detector = LanguageDetector()
        # Maps ISO 639-1 code -> list of document dicts for that language.
        self.documents: Dict[str, List[Dict]] = {}
        # Language of the first document added; used as translation source.
        self.primary_language: Optional[str] = None

    def add_document(
        self,
        file_path: str,
        content: str,
        metadata: Optional[Dict] = None,
        force_language: Optional[str] = None
    ) -> None:
        """
        Add document with language detection.

        Detection priority: force_language > filename pattern > content.

        Args:
            file_path: Path to document
            content: Document content
            metadata: Additional metadata
            force_language: Override language detection
        """
        if force_language:
            # Caller knows best: full confidence, no detection performed.
            lang_code = force_language
            lang_info = LanguageInfo(
                code=lang_code,
                name=self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
                confidence=1.0,
                script=self.detector.SCRIPTS.get(lang_code)
            )
        else:
            # Filename markers (file.en.md etc.) are near-certain, so try
            # them before content analysis.
            filename_lang = self.detector.detect_from_filename(file_path)
            if filename_lang:
                lang_code = filename_lang
                lang_info = LanguageInfo(
                    code=lang_code,
                    name=self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
                    confidence=0.95,
                    script=self.detector.SCRIPTS.get(lang_code)
                )
            else:
                lang_info = self.detector.detect(content)
                lang_code = lang_info.code

        # The first document added defines the primary (source) language.
        if self.primary_language is None:
            self.primary_language = lang_code

        self.documents.setdefault(lang_code, []).append({
            'file_path': file_path,
            'content': content,
            'language': lang_info.code,
            'language_name': lang_info.name,
            'confidence': lang_info.confidence,
            'script': lang_info.script,
            'metadata': metadata or {}
        })

    def get_languages(self) -> List[str]:
        """Get list of detected languages."""
        return sorted(self.documents.keys())

    def get_document_count(self, language: Optional[str] = None) -> int:
        """
        Get document count for a language.

        Args:
            language: Language code (None for all)

        Returns:
            Number of documents
        """
        if language:
            return len(self.documents.get(language, []))
        return sum(len(docs) for docs in self.documents.values())

    def get_translation_status(self, base_language: Optional[str] = None) -> TranslationStatus:
        """
        Get translation status.

        Args:
            base_language: Base language (None for primary)

        Returns:
            Translation status summary
        """
        base_lang = base_language or self.primary_language or 'en'

        all_languages = set(self.documents.keys())
        base_count = self.get_document_count(base_lang)

        if base_count == 0:
            # No source documents: nothing can be considered translated.
            return TranslationStatus(
                source_language=base_lang,
                target_languages=[],
                translated_languages=set(),
                missing_languages=set(),
                completeness=0.0
            )

        # Any non-base language with at least one document counts as translated.
        translated = set()
        for lang in all_languages:
            if lang != base_lang and self.get_document_count(lang) > 0:
                translated.add(lang)

        # Completeness = fraction of the commonly expected languages that are
        # actually present.  (Previously every detected language counted, so
        # e.g. 'ru' + 'pt' alone reported 33% despite covering none of the
        # expected set.)
        expected_languages = {'en', 'es', 'fr', 'de', 'zh', 'ja'}
        missing = expected_languages - all_languages

        completeness = len(all_languages & expected_languages) / len(expected_languages)

        return TranslationStatus(
            source_language=base_lang,
            target_languages=list(all_languages - {base_lang}),
            translated_languages=translated,
            missing_languages=missing,
            completeness=min(completeness, 1.0)
        )

    def export_by_language(self, output_dir: Path) -> Dict[str, Path]:
        """
        Export documents organized by language.

        Args:
            output_dir: Output directory

        Returns:
            Dictionary mapping language codes to output paths
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        exports = {}

        for lang_code, docs in self.documents.items():
            lang_file = output_dir / f"documents_{lang_code}.json"

            export_data = {
                'language': lang_code,
                'language_name': self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
                'document_count': len(docs),
                'documents': docs
            }

            # ensure_ascii=False keeps non-ASCII text readable, so the file
            # must be written as UTF-8 explicitly -- the platform default
            # encoding (e.g. cp1252 on Windows) may not represent it.
            lang_file.write_text(
                json.dumps(export_data, indent=2, ensure_ascii=False),
                encoding='utf-8',
            )
            exports[lang_code] = lang_file

        return exports

    def generate_translation_report(self) -> str:
        """
        Generate human-readable translation report.

        Returns:
            Formatted report string
        """
        lines = ["=" * 60]
        lines.append("MULTI-LANGUAGE DOCUMENTATION REPORT")
        lines.append("=" * 60)
        lines.append("")

        # Summary
        languages = self.get_languages()
        total_docs = self.get_document_count()

        lines.append("📊 Summary:")
        lines.append(f"   Languages: {len(languages)}")
        lines.append(f"   Total documents: {total_docs}")
        lines.append(f"   Primary language: {self.primary_language or 'Unknown'}")
        lines.append("")

        # Per-language document counts and share of the corpus.
        lines.append("🌍 Language Breakdown:")
        for lang in languages:
            count = self.get_document_count(lang)
            lang_name = self.detector.LANGUAGE_NAMES.get(lang, lang.upper())
            percentage = (count / total_docs * 100) if total_docs > 0 else 0
            lines.append(f"   {lang_name} ({lang}): {count} docs ({percentage:.1f}%)")
        lines.append("")

        # Translation status
        status = self.get_translation_status()
        lines.append("📝 Translation Status:")
        lines.append(f"   Source: {status.source_language}")
        lines.append(f"   Translated to: {', '.join(status.translated_languages) or 'None'}")
        lines.append(f"   Completeness: {status.completeness * 100:.1f}%")

        if status.missing_languages:
            lines.append(f"   Missing: {', '.join(sorted(status.missing_languages))}")
lines.append("") + + lines.append("=" * 60) + + return "\n".join(lines) + + +def example_usage(): + """Example usage of multi-language support.""" + from pathlib import Path + + manager = MultiLanguageManager() + + # Add documents in different languages + manager.add_document( + "README.md", + "# Getting Started\n\nThis is an English document about the project.", + {"category": "overview"} + ) + + manager.add_document( + "README.es.md", + "# Empezando\n\nEste es un documento en español sobre el proyecto.", + {"category": "overview"} + ) + + manager.add_document( + "README.fr.md", + "# Commencer\n\nCeci est un document en français sur le projet.", + {"category": "overview"} + ) + + # Generate report + print(manager.generate_translation_report()) + + # Export by language + exports = manager.export_by_language(Path("output/multilang")) + print(f"\n✅ Exported {len(exports)} language files:") + for lang, path in exports.items(): + print(f" {lang}: {path}") + + +if __name__ == "__main__": + example_usage() diff --git a/tests/test_multilang_support.py b/tests/test_multilang_support.py new file mode 100644 index 0000000..0c390e6 --- /dev/null +++ b/tests/test_multilang_support.py @@ -0,0 +1,304 @@ +#!/usr/bin/env python3 +""" +Tests for multi-language documentation support. + +Validates: +- Language detection (content and filename) +- Multi-language organization +- Translation status tracking +- Language filtering +- Export by language +""" + +import pytest +from pathlib import Path +import sys +import tempfile +import json + +# Add src to path +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from skill_seekers.cli.multilang_support import ( + LanguageDetector, + MultiLanguageManager, + LanguageInfo +) + + +def test_detect_english(): + """Test English language detection.""" + detector = LanguageDetector() + + text = "This is an English document. It contains common English words." 
+ lang_info = detector.detect(text) + + assert lang_info.code == 'en' + assert lang_info.name == 'English' + assert lang_info.confidence > 0.0 + + +def test_detect_spanish(): + """Test Spanish language detection.""" + detector = LanguageDetector() + + text = "Este es un documento en español. Contiene palabras comunes en español." + lang_info = detector.detect(text) + + assert lang_info.code == 'es' + assert lang_info.name == 'Spanish' + + +def test_detect_french(): + """Test French language detection.""" + detector = LanguageDetector() + + text = "Ceci est un document en français. Il contient des mots français communs." + lang_info = detector.detect(text) + + assert lang_info.code == 'fr' + assert lang_info.name == 'French' + + +def test_detect_german(): + """Test German language detection.""" + detector = LanguageDetector() + + text = "Dies ist ein deutsches Dokument. Es enthält übliche deutsche Wörter." + lang_info = detector.detect(text) + + assert lang_info.code == 'de' + assert lang_info.name == 'German' + + +def test_detect_chinese(): + """Test Chinese language detection.""" + detector = LanguageDetector() + + text = "这是一个中文文档。它包含常见的中文字符。" + lang_info = detector.detect(text) + + assert lang_info.code == 'zh' + assert lang_info.name == 'Chinese' + + +def test_detect_from_filename_dot_pattern(): + """Test language detection from filename (file.en.md pattern).""" + detector = LanguageDetector() + + assert detector.detect_from_filename("README.en.md") == 'en' + assert detector.detect_from_filename("guide.es.md") == 'es' + assert detector.detect_from_filename("doc.fr.md") == 'fr' + + +def test_detect_from_filename_underscore_pattern(): + """Test language detection from filename (file_en.md pattern).""" + detector = LanguageDetector() + + assert detector.detect_from_filename("README_en.md") == 'en' + assert detector.detect_from_filename("guide_es.md") == 'es' + + +def test_detect_from_filename_dash_pattern(): + """Test language detection from filename (file-en.md 
pattern).""" + detector = LanguageDetector() + + assert detector.detect_from_filename("README-en.md") == 'en' + assert detector.detect_from_filename("guide-es.md") == 'es' + + +def test_detect_from_filename_no_match(): + """Test filename with no language pattern.""" + detector = LanguageDetector() + + assert detector.detect_from_filename("README.md") is None + assert detector.detect_from_filename("guide.txt") is None + + +def test_add_document_single_language(): + """Test adding documents in single language.""" + manager = MultiLanguageManager() + + manager.add_document( + "README.md", + "This is an English document.", + {"category": "overview"} + ) + + assert len(manager.get_languages()) == 1 + assert 'en' in manager.get_languages() + assert manager.get_document_count('en') == 1 + + +def test_add_document_multiple_languages(): + """Test adding documents in multiple languages.""" + manager = MultiLanguageManager() + + manager.add_document("README.md", "This is English.", {}) + manager.add_document("README.es.md", "Esto es español.", {}) + manager.add_document("README.fr.md", "Ceci est français.", {}) + + assert len(manager.get_languages()) == 3 + assert 'en' in manager.get_languages() + assert 'es' in manager.get_languages() + assert 'fr' in manager.get_languages() + + +def test_force_language(): + """Test forcing language override.""" + manager = MultiLanguageManager() + + # Force Spanish despite English content + manager.add_document( + "file.md", + "This is actually English content.", + {}, + force_language='es' + ) + + assert 'es' in manager.get_languages() + assert manager.get_document_count('es') == 1 + + +def test_filename_language_priority(): + """Test filename pattern takes priority over content detection.""" + manager = MultiLanguageManager() + + # Filename says Spanish, but content is English + manager.add_document( + "guide.es.md", + "This is English content.", + {} + ) + + # Should use filename language + assert 'es' in manager.get_languages() + + +def 
test_document_count_all(): + """Test total document count.""" + manager = MultiLanguageManager() + + manager.add_document("file1.md", "English doc 1", {}) + manager.add_document("file2.md", "English doc 2", {}) + manager.add_document("file3.es.md", "Spanish doc", {}) + + assert manager.get_document_count() == 3 + assert manager.get_document_count('en') == 2 + assert manager.get_document_count('es') == 1 + + +def test_primary_language(): + """Test primary language is set correctly.""" + manager = MultiLanguageManager() + + manager.add_document("file1.md", "First English doc", {}) + manager.add_document("file2.es.md", "Spanish doc", {}) + + # Primary should be first added + assert manager.primary_language == 'en' + + +def test_translation_status(): + """Test translation status tracking.""" + manager = MultiLanguageManager() + + manager.add_document("README.md", "English doc", {}) + manager.add_document("README.es.md", "Spanish doc", {}) + manager.add_document("README.fr.md", "French doc", {}) + + status = manager.get_translation_status() + + assert status.source_language == 'en' + assert 'es' in status.translated_languages + assert 'fr' in status.translated_languages + assert len(status.translated_languages) == 2 + + +def test_export_by_language(): + """Test exporting documents by language.""" + manager = MultiLanguageManager() + + manager.add_document("file1.md", "English content", {}) + manager.add_document("file2.es.md", "Spanish content", {}) + + with tempfile.TemporaryDirectory() as tmpdir: + exports = manager.export_by_language(Path(tmpdir)) + + assert len(exports) == 2 + assert 'en' in exports + assert 'es' in exports + + # Check files exist + assert exports['en'].exists() + assert exports['es'].exists() + + # Check content + en_data = json.loads(exports['en'].read_text()) + assert en_data['language'] == 'en' + assert en_data['document_count'] == 1 + + +def test_translation_report_generation(): + """Test translation report generation.""" + manager = 
MultiLanguageManager() + + manager.add_document("file1.md", "English doc", {}) + manager.add_document("file2.es.md", "Spanish doc", {}) + + report = manager.generate_translation_report() + + assert "MULTI-LANGUAGE DOCUMENTATION REPORT" in report + assert "Languages: 2" in report + assert "English (en)" in report + assert "Spanish (es)" in report + + +def test_empty_manager(): + """Test manager with no documents.""" + manager = MultiLanguageManager() + + assert len(manager.get_languages()) == 0 + assert manager.get_document_count() == 0 + assert manager.primary_language is None + + +def test_script_detection(): + """Test script type detection.""" + detector = LanguageDetector() + + # English uses Latin script + en_info = detector.detect("This is English") + assert en_info.script == 'Latin' + + # Chinese uses Han script + zh_info = detector.detect("这是中文") + assert zh_info.script == 'Han' + + +def test_confidence_scoring(): + """Test confidence scoring.""" + detector = LanguageDetector() + + # Strong English signal + strong_en = "The quick brown fox jumps over the lazy dog. This is clearly English." + lang_info = detector.detect(strong_en) + + assert lang_info.code == 'en' + assert lang_info.confidence > 0.3 # Should have decent confidence + + +def test_metadata_preservation(): + """Test metadata is preserved.""" + manager = MultiLanguageManager() + + metadata = {"category": "guide", "version": "1.0"} + manager.add_document("file.md", "English content", metadata) + + docs = manager.documents['en'] + assert len(docs) == 1 + assert docs[0]['metadata'] == metadata + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])