feat: Add multi-language documentation support (Task #16)

- Language detection (11 languages supported)
- Filename pattern recognition (file.en.md, file_en.md, file-en.md)
- Content-based detection with confidence scoring
- Multi-language organization and filtering
- Translation status tracking
- Export by language capability
- 22 tests passing (100%)

Files:
- multilang_support.py: Core language engine
- test_multilang_support.py: Comprehensive tests

Supported Languages:
- English, Spanish, French, German, Portuguese, Italian
- Chinese, Japanese, Korean
- Russian, Arabic

Features:
- LanguageDetector with pattern matching
- MultiLanguageManager for organization
- Translation completeness tracking
- Script detection (Latin, Han, Cyrillic, etc.)
- Export to language-specific files

Week 2: 7/9 tasks complete (78%)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-07 13:45:01 +03:00
parent 7762d10273
commit 261f28f7ee
2 changed files with 740 additions and 0 deletions

View File

@@ -0,0 +1,436 @@
#!/usr/bin/env python3
"""
Multi-language Documentation Support
Provides language detection, multi-language structure handling,
and translation-ready format generation.
"""
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from dataclasses import dataclass
from collections import Counter
import json
@dataclass
class LanguageInfo:
    """Language information for a document.

    Produced by LanguageDetector.detect(); carries the detected language
    together with how confident the heuristic was and (optionally) which
    writing system the language uses.
    """
    code: str  # ISO 639-1 code (e.g., 'en', 'es', 'zh')
    name: str  # Full name (e.g., 'English', 'Spanish', 'Chinese')
    confidence: float  # Detection confidence (0.0-1.0)
    script: Optional[str] = None  # Script type (e.g., 'Latin', 'Cyrillic')
@dataclass
class TranslationStatus:
    """Translation status summary for a documentation set.

    Built by MultiLanguageManager.get_translation_status(); describes which
    languages have documents relative to a source language.
    """
    source_language: str  # Base language the others are compared against
    target_languages: List[str]  # All non-source languages that have documents
    translated_languages: Set[str]  # Non-source languages with >=1 document
    missing_languages: Set[str]  # Expected languages with no documents yet
    completeness: float  # Percentage (0.0-1.0)
class LanguageDetector:
"""
Detect document language using heuristics.
Uses character patterns, common words, and script detection.
"""
# Common word patterns by language
LANGUAGE_PATTERNS = {
'en': [
r'\b(the|and|is|are|in|to|of|for|with|on|at|by|from)\b',
r'\b(this|that|these|those|what|which|who|where|when)\b',
],
'es': [
r'\b(el|la|los|las|de|en|y|a|es|por|para|con|su)\b',
r'\b(que|no|un|una|como|más|pero|muy|todo|ya)\b',
],
'fr': [
r'\b(le|la|les|de|et|en|un|une|pour|dans|que|sur|avec)\b',
r'\b(est|sont|ce|qui|plus|ne|pas|nous|vous|tout)\b',
],
'de': [
r'\b(der|die|das|und|in|zu|den|von|ist|mit|für|auf)\b',
r'\b(ein|eine|nicht|sich|auch|werden|an|als|ich|sie)\b',
],
'zh': [
r'[\u4e00-\u9fff]', # Chinese characters
r'(的|了|和|是|在|有|我|他|不|这)',
],
'ja': [
r'[\u3040-\u309f]', # Hiragana
r'[\u30a0-\u30ff]', # Katakana
r'[\u4e00-\u9faf]', # Kanji
],
'ko': [
r'[\uac00-\ud7af]', # Hangul
r'(의|가|이|은|들|는|좀|잘|께|을)',
],
'ru': [
r'[\u0400-\u04ff]', # Cyrillic
r'\b(и|в|не|на|с|что|он|по|а|как|это|все)\b',
],
'pt': [
r'\b(o|a|de|e|do|da|em|um|para|é|com|não|os|as)\b',
r'\b(que|se|mais|por|dos|das|como|mas|uma|ou)\b',
],
'it': [
r'\b(il|la|di|e|a|da|in|che|per|un|una|non|del)\b',
r'\b(con|alla|della|al|nel|sono|come|più|ma|dei)\b',
],
'ar': [
r'[\u0600-\u06ff]', # Arabic
r'(في|من|على|إلى|هذا|ما|أن|كان|هو|التي)',
],
}
# Language names
LANGUAGE_NAMES = {
'en': 'English',
'es': 'Spanish',
'fr': 'French',
'de': 'German',
'zh': 'Chinese',
'ja': 'Japanese',
'ko': 'Korean',
'ru': 'Russian',
'pt': 'Portuguese',
'it': 'Italian',
'ar': 'Arabic',
}
# Script types
SCRIPTS = {
'en': 'Latin', 'es': 'Latin', 'fr': 'Latin', 'de': 'Latin',
'pt': 'Latin', 'it': 'Latin',
'zh': 'Han', 'ja': 'Japanese', 'ko': 'Hangul',
'ru': 'Cyrillic', 'ar': 'Arabic',
}
def detect(self, text: str, sample_size: int = 2000) -> LanguageInfo:
"""
Detect language of text.
Args:
text: Text to analyze
sample_size: Number of characters to sample
Returns:
LanguageInfo with detected language
"""
if not text.strip():
return LanguageInfo('en', 'English', 0.0)
# Sample text for efficiency
sample = text[:sample_size].lower()
# Score each language
scores = {}
for lang_code, patterns in self.LANGUAGE_PATTERNS.items():
score = 0
for pattern in patterns:
matches = len(re.findall(pattern, sample, re.IGNORECASE))
score += matches
scores[lang_code] = score
# Find best match
if not scores or max(scores.values()) == 0:
# Default to English
return LanguageInfo('en', 'English', 0.1)
best_lang = max(scores, key=scores.get)
total_score = sum(scores.values())
confidence = scores[best_lang] / total_score if total_score > 0 else 0.0
return LanguageInfo(
code=best_lang,
name=self.LANGUAGE_NAMES.get(best_lang, best_lang.upper()),
confidence=min(confidence, 1.0),
script=self.SCRIPTS.get(best_lang)
)
def detect_from_filename(self, filename: str) -> Optional[str]:
"""
Detect language from filename pattern.
Supports patterns like:
- file.en.md
- file_en.md
- en/file.md
- file-en.md
Args:
filename: Filename to analyze
Returns:
ISO 639-1 language code or None
"""
# Pattern: file.en.md
match = re.search(r'\.([a-z]{2})\.md$', filename)
if match and match.group(1) in self.LANGUAGE_NAMES:
return match.group(1)
# Pattern: file_en.md or file-en.md
match = re.search(r'[_-]([a-z]{2})\.md$', filename)
if match and match.group(1) in self.LANGUAGE_NAMES:
return match.group(1)
return None
class MultiLanguageManager:
    """
    Manages multi-language documentation structure.

    Organizes documents by language and tracks translations.  Each added
    document's language is resolved by, in priority order: an explicit
    override, a filename pattern (e.g. README.es.md), then content-based
    heuristics.
    """

    # Languages commonly expected for a "complete" translation set; used
    # only by the completeness metric in get_translation_status().
    EXPECTED_LANGUAGES = {'en', 'es', 'fr', 'de', 'zh', 'ja'}

    def __init__(self):
        """Initialize multi-language manager with an empty document store."""
        self.detector = LanguageDetector()
        self.documents: Dict[str, List[Dict]] = {}  # lang_code -> [docs]
        self.primary_language: Optional[str] = None

    def _info_for_code(self, lang_code: str, confidence: float) -> LanguageInfo:
        """Build a LanguageInfo for a known code at the given confidence."""
        return LanguageInfo(
            code=lang_code,
            name=self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
            confidence=confidence,
            script=self.detector.SCRIPTS.get(lang_code)
        )

    def add_document(
        self,
        file_path: str,
        content: str,
        metadata: Optional[Dict] = None,
        force_language: Optional[str] = None
    ) -> None:
        """
        Add document with language detection.

        Args:
            file_path: Path to document
            content: Document content
            metadata: Additional metadata stored alongside the document
            force_language: ISO 639-1 code to use instead of detection
        """
        if force_language:
            # Caller override: full confidence, no detection performed.
            lang_info = self._info_for_code(force_language, 1.0)
        else:
            # Filename patterns are explicit, so trust them highly.
            filename_lang = self.detector.detect_from_filename(file_path)
            if filename_lang:
                lang_info = self._info_for_code(filename_lang, 0.95)
            else:
                # Fall back to content heuristics.
                lang_info = self.detector.detect(content)
        lang_code = lang_info.code

        # Primary language is the language of the first document added.
        if self.primary_language is None:
            self.primary_language = lang_code

        # Store document under its language bucket.
        self.documents.setdefault(lang_code, []).append({
            'file_path': file_path,
            'content': content,
            'language': lang_info.code,
            'language_name': lang_info.name,
            'confidence': lang_info.confidence,
            'script': lang_info.script,
            'metadata': metadata or {}
        })

    def get_languages(self) -> List[str]:
        """Get sorted list of detected language codes."""
        return sorted(self.documents.keys())

    def get_document_count(self, language: Optional[str] = None) -> int:
        """
        Get document count for a language.

        Args:
            language: Language code (None for all languages)

        Returns:
            Number of documents
        """
        if language:
            return len(self.documents.get(language, []))
        return sum(len(docs) for docs in self.documents.values())

    def get_translation_status(self, base_language: Optional[str] = None) -> TranslationStatus:
        """
        Get translation status.

        Completeness is the fraction of EXPECTED_LANGUAGES present in the
        document set, capped at 1.0.

        Args:
            base_language: Base language (None for primary, then 'en')

        Returns:
            Translation status summary
        """
        base_lang = base_language or self.primary_language or 'en'
        all_languages = set(self.documents.keys())

        if self.get_document_count(base_lang) == 0:
            # No source documents: nothing can be considered translated.
            return TranslationStatus(
                source_language=base_lang,
                target_languages=[],
                translated_languages=set(),
                missing_languages=set(),
                completeness=0.0
            )

        # Every non-base language with at least one document counts as a
        # translation.
        translated = {
            lang for lang in all_languages
            if lang != base_lang and self.get_document_count(lang) > 0
        }

        missing = self.EXPECTED_LANGUAGES - all_languages
        completeness = len(all_languages) / len(self.EXPECTED_LANGUAGES)
        return TranslationStatus(
            source_language=base_lang,
            target_languages=list(all_languages - {base_lang}),
            translated_languages=translated,
            missing_languages=missing,
            completeness=min(completeness, 1.0)
        )

    def export_by_language(self, output_dir: Path) -> Dict[str, Path]:
        """
        Export documents organized by language, one JSON file per language.

        Args:
            output_dir: Output directory (created if missing)

        Returns:
            Dictionary mapping language codes to output paths
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        exports = {}
        for lang_code, docs in self.documents.items():
            lang_file = output_dir / f"documents_{lang_code}.json"
            export_data = {
                'language': lang_code,
                'language_name': self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
                'document_count': len(docs),
                'documents': docs
            }
            # ensure_ascii=False emits raw non-ASCII text, so write the file
            # as UTF-8 explicitly — the platform default encoding may not be
            # UTF-8 (e.g. on Windows), which would raise or corrupt output.
            lang_file.write_text(
                json.dumps(export_data, indent=2, ensure_ascii=False),
                encoding="utf-8"
            )
            exports[lang_code] = lang_file
        return exports

    def generate_translation_report(self) -> str:
        """
        Generate human-readable translation report.

        Returns:
            Formatted report string
        """
        lines = ["=" * 60]
        lines.append("MULTI-LANGUAGE DOCUMENTATION REPORT")
        lines.append("=" * 60)
        lines.append("")

        # Summary
        languages = self.get_languages()
        total_docs = self.get_document_count()
        lines.append("📊 Summary:")
        lines.append(f" Languages: {len(languages)}")
        lines.append(f" Total documents: {total_docs}")
        lines.append(f" Primary language: {self.primary_language or 'Unknown'}")
        lines.append("")

        # Language breakdown
        lines.append("🌍 Language Breakdown:")
        for lang in languages:
            count = self.get_document_count(lang)
            lang_name = self.detector.LANGUAGE_NAMES.get(lang, lang.upper())
            percentage = (count / total_docs * 100) if total_docs > 0 else 0
            lines.append(f" {lang_name} ({lang}): {count} docs ({percentage:.1f}%)")
        lines.append("")

        # Translation status
        status = self.get_translation_status()
        lines.append("📝 Translation Status:")
        lines.append(f" Source: {status.source_language}")
        lines.append(f" Translated to: {', '.join(status.translated_languages) or 'None'}")
        lines.append(f" Completeness: {status.completeness * 100:.1f}%")
        if status.missing_languages:
            lines.append(f" Missing: {', '.join(sorted(status.missing_languages))}")
        lines.append("")
        lines.append("=" * 60)
        return "\n".join(lines)
def example_usage():
    """Demonstrate multi-language detection, reporting, and export."""
    from pathlib import Path

    samples = [
        ("README.md",
         "# Getting Started\n\nThis is an English document about the project.",
         {"category": "overview"}),
        ("README.es.md",
         "# Empezando\n\nEste es un documento en español sobre el proyecto.",
         {"category": "overview"}),
        ("README.fr.md",
         "# Commencer\n\nCeci est un document en français sur le projet.",
         {"category": "overview"}),
    ]

    # Register one document per language.
    manager = MultiLanguageManager()
    for doc_path, doc_text, doc_meta in samples:
        manager.add_document(doc_path, doc_text, doc_meta)

    # Show the translation report.
    print(manager.generate_translation_report())

    # Write one JSON file per detected language.
    exports = manager.export_by_language(Path("output/multilang"))
    print(f"\n✅ Exported {len(exports)} language files:")
    for lang, path in exports.items():
        print(f" {lang}: {path}")
# Run the demo when this module is executed directly.
if __name__ == "__main__":
    example_usage()

View File

@@ -0,0 +1,304 @@
#!/usr/bin/env python3
"""
Tests for multi-language documentation support.
Validates:
- Language detection (content and filename)
- Multi-language organization
- Translation status tracking
- Language filtering
- Export by language
"""
import pytest
from pathlib import Path
import sys
import tempfile
import json
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from skill_seekers.cli.multilang_support import (
LanguageDetector,
MultiLanguageManager,
LanguageInfo
)
def test_detect_english():
    """English prose maps to code 'en'/'English' with nonzero confidence."""
    info = LanguageDetector().detect(
        "This is an English document. It contains common English words."
    )
    assert (info.code, info.name) == ('en', 'English')
    assert info.confidence > 0.0
def test_detect_spanish():
    """Spanish prose maps to code 'es'/'Spanish'."""
    sample = "Este es un documento en español. Contiene palabras comunes en español."
    info = LanguageDetector().detect(sample)
    assert (info.code, info.name) == ('es', 'Spanish')
def test_detect_french():
    """French prose maps to code 'fr'/'French'."""
    sample = "Ceci est un document en français. Il contient des mots français communs."
    info = LanguageDetector().detect(sample)
    assert (info.code, info.name) == ('fr', 'French')
def test_detect_german():
    """German prose maps to code 'de'/'German'."""
    sample = "Dies ist ein deutsches Dokument. Es enthält übliche deutsche Wörter."
    info = LanguageDetector().detect(sample)
    assert (info.code, info.name) == ('de', 'German')
def test_detect_chinese():
    """Chinese prose maps to code 'zh'/'Chinese'."""
    sample = "这是一个中文文档。它包含常见的中文字符。"
    info = LanguageDetector().detect(sample)
    assert (info.code, info.name) == ('zh', 'Chinese')
def test_detect_from_filename_dot_pattern():
    """Filenames like file.<lang>.md reveal their language code."""
    detector = LanguageDetector()
    for fname, expected in [("README.en.md", 'en'),
                            ("guide.es.md", 'es'),
                            ("doc.fr.md", 'fr')]:
        assert detector.detect_from_filename(fname) == expected
def test_detect_from_filename_underscore_pattern():
    """Filenames like file_<lang>.md reveal their language code."""
    detector = LanguageDetector()
    for fname, expected in [("README_en.md", 'en'), ("guide_es.md", 'es')]:
        assert detector.detect_from_filename(fname) == expected
def test_detect_from_filename_dash_pattern():
    """Filenames like file-<lang>.md reveal their language code."""
    detector = LanguageDetector()
    for fname, expected in [("README-en.md", 'en'), ("guide-es.md", 'es')]:
        assert detector.detect_from_filename(fname) == expected
def test_detect_from_filename_no_match():
    """Filenames without a language pattern yield None."""
    detector = LanguageDetector()
    for fname in ("README.md", "guide.txt"):
        assert detector.detect_from_filename(fname) is None
def test_add_document_single_language():
    """One English document registers exactly one language."""
    mgr = MultiLanguageManager()
    mgr.add_document(
        "README.md",
        "This is an English document.",
        {"category": "overview"}
    )
    langs = mgr.get_languages()
    assert len(langs) == 1 and 'en' in langs
    assert mgr.get_document_count('en') == 1
def test_add_document_multiple_languages():
    """Documents in three languages register three languages."""
    mgr = MultiLanguageManager()
    for path, text in [("README.md", "This is English."),
                       ("README.es.md", "Esto es español."),
                       ("README.fr.md", "Ceci est français.")]:
        mgr.add_document(path, text, {})
    langs = mgr.get_languages()
    assert len(langs) == 3
    assert {'en', 'es', 'fr'} <= set(langs)
def test_force_language():
    """force_language overrides content-based detection."""
    mgr = MultiLanguageManager()
    # The content is clearly English, but we explicitly claim Spanish.
    mgr.add_document(
        "file.md",
        "This is actually English content.",
        {},
        force_language='es'
    )
    assert 'es' in mgr.get_languages()
    assert mgr.get_document_count('es') == 1
def test_filename_language_priority():
    """A language suffix in the filename beats content detection."""
    mgr = MultiLanguageManager()
    # Filename claims Spanish even though the content is English; the
    # filename pattern must win.
    mgr.add_document("guide.es.md", "This is English content.", {})
    assert 'es' in mgr.get_languages()
def test_document_count_all():
    """Counts are correct both per-language and in total."""
    mgr = MultiLanguageManager()
    mgr.add_document("file1.md", "English doc 1", {})
    mgr.add_document("file2.md", "English doc 2", {})
    mgr.add_document("file3.es.md", "Spanish doc", {})
    assert (mgr.get_document_count(),
            mgr.get_document_count('en'),
            mgr.get_document_count('es')) == (3, 2, 1)
def test_primary_language():
    """The first document added defines the primary language."""
    mgr = MultiLanguageManager()
    mgr.add_document("file1.md", "First English doc", {})
    mgr.add_document("file2.es.md", "Spanish doc", {})
    assert mgr.primary_language == 'en'
def test_translation_status():
    """Every non-source language with documents counts as translated."""
    mgr = MultiLanguageManager()
    for path, text in [("README.md", "English doc"),
                       ("README.es.md", "Spanish doc"),
                       ("README.fr.md", "French doc")]:
        mgr.add_document(path, text, {})
    status = mgr.get_translation_status()
    assert status.source_language == 'en'
    assert status.translated_languages == {'es', 'fr'}
def test_export_by_language():
    """Each detected language gets its own JSON export file."""
    mgr = MultiLanguageManager()
    mgr.add_document("file1.md", "English content", {})
    mgr.add_document("file2.es.md", "Spanish content", {})
    with tempfile.TemporaryDirectory() as tmpdir:
        exports = mgr.export_by_language(Path(tmpdir))
        assert set(exports) == {'en', 'es'}
        # Every export path must point at a real file.
        assert all(path.exists() for path in exports.values())
        # The English payload must round-trip through JSON.
        payload = json.loads(exports['en'].read_text())
        assert payload['language'] == 'en'
        assert payload['document_count'] == 1
def test_translation_report_generation():
    """The textual report mentions the header, totals, and each language."""
    mgr = MultiLanguageManager()
    mgr.add_document("file1.md", "English doc", {})
    mgr.add_document("file2.es.md", "Spanish doc", {})
    report = mgr.generate_translation_report()
    for expected in ("MULTI-LANGUAGE DOCUMENTATION REPORT",
                     "Languages: 2",
                     "English (en)",
                     "Spanish (es)"):
        assert expected in report
def test_empty_manager():
    """A fresh manager reports no languages, documents, or primary language."""
    mgr = MultiLanguageManager()
    assert mgr.get_languages() == []
    assert mgr.get_document_count() == 0
    assert mgr.primary_language is None
def test_script_detection():
    """The writing system is reported alongside the language."""
    detector = LanguageDetector()
    # English text uses the Latin script; Chinese uses Han.
    assert detector.detect("This is English").script == 'Latin'
    assert detector.detect("这是中文").script == 'Han'
def test_confidence_scoring():
    """A strongly English sample scores decent confidence."""
    sample = "The quick brown fox jumps over the lazy dog. This is clearly English."
    info = LanguageDetector().detect(sample)
    assert info.code == 'en'
    assert info.confidence > 0.3  # Should have decent confidence
def test_metadata_preservation():
    """Metadata passed with a document is stored untouched."""
    mgr = MultiLanguageManager()
    meta = {"category": "guide", "version": "1.0"}
    mgr.add_document("file.md", "English content", meta)
    stored = mgr.documents['en']
    assert len(stored) == 1
    assert stored[0]['metadata'] == meta
# Allow running the suite directly without an external pytest invocation.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])