feat: Add multi-language documentation support (Task #16)
- Language detection (11 languages supported)
- Filename pattern recognition (file.en.md, file_en.md, file-en.md)
- Content-based detection with confidence scoring
- Multi-language organization and filtering
- Translation status tracking
- Export by language capability
- 22 tests passing (100%)

Files:
- multilang_support.py: Core language engine
- test_multilang_support.py: Comprehensive tests

Supported Languages:
- English, Spanish, French, German, Portuguese, Italian
- Chinese, Japanese, Korean
- Russian, Arabic

Features:
- LanguageDetector with pattern matching
- MultiLanguageManager for organization
- Translation completeness tracking
- Script detection (Latin, Han, Cyrillic, etc.)
- Export to language-specific files

Week 2: 7/9 tasks complete (78%)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
304
tests/test_multilang_support.py
Normal file
304
tests/test_multilang_support.py
Normal file
@@ -0,0 +1,304 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for multi-language documentation support.
|
||||
|
||||
Validates:
|
||||
- Language detection (content and filename)
|
||||
- Multi-language organization
|
||||
- Translation status tracking
|
||||
- Language filtering
|
||||
- Export by language
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import tempfile
|
||||
import json
|
||||
|
||||
# Add src to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from skill_seekers.cli.multilang_support import (
|
||||
LanguageDetector,
|
||||
MultiLanguageManager,
|
||||
LanguageInfo
|
||||
)
|
||||
|
||||
|
||||
def test_detect_english():
    """Check that an English sample is detected as ``en`` / ``English``."""
    sample = "This is an English document. It contains common English words."
    result = LanguageDetector().detect(sample)

    assert (result.code, result.name) == ("en", "English")
    # The score only has to be positive for this sample.
    assert result.confidence > 0.0
|
||||
|
||||
|
||||
def test_detect_spanish():
    """Check that a Spanish sample is detected as ``es`` / ``Spanish``."""
    sample = "Este es un documento en español. Contiene palabras comunes en español."
    result = LanguageDetector().detect(sample)

    assert (result.code, result.name) == ("es", "Spanish")
|
||||
|
||||
|
||||
def test_detect_french():
    """Check that a French sample is detected as ``fr`` / ``French``."""
    sample = "Ceci est un document en français. Il contient des mots français communs."
    result = LanguageDetector().detect(sample)

    assert (result.code, result.name) == ("fr", "French")
|
||||
|
||||
|
||||
def test_detect_german():
    """Check that a German sample is detected as ``de`` / ``German``."""
    sample = "Dies ist ein deutsches Dokument. Es enthält übliche deutsche Wörter."
    result = LanguageDetector().detect(sample)

    assert (result.code, result.name) == ("de", "German")
|
||||
|
||||
|
||||
def test_detect_chinese():
    """Check that a Chinese sample is detected as ``zh`` / ``Chinese``."""
    sample = "这是一个中文文档。它包含常见的中文字符。"
    result = LanguageDetector().detect(sample)

    assert (result.code, result.name) == ("zh", "Chinese")
|
||||
|
||||
|
||||
def test_detect_from_filename_dot_pattern():
    """Check filename detection for the dot-separated form (file.en.md)."""
    detector = LanguageDetector()
    expected_by_filename = {
        "README.en.md": "en",
        "guide.es.md": "es",
        "doc.fr.md": "fr",
    }

    for filename, expected_code in expected_by_filename.items():
        assert detector.detect_from_filename(filename) == expected_code
|
||||
|
||||
|
||||
def test_detect_from_filename_underscore_pattern():
    """Check filename detection for the underscore form (file_en.md)."""
    detector = LanguageDetector()
    expected_by_filename = {
        "README_en.md": "en",
        "guide_es.md": "es",
    }

    for filename, expected_code in expected_by_filename.items():
        assert detector.detect_from_filename(filename) == expected_code
|
||||
|
||||
|
||||
def test_detect_from_filename_dash_pattern():
    """Check filename detection for the dash-separated form (file-en.md)."""
    detector = LanguageDetector()
    expected_by_filename = {
        "README-en.md": "en",
        "guide-es.md": "es",
    }

    for filename, expected_code in expected_by_filename.items():
        assert detector.detect_from_filename(filename) == expected_code
|
||||
|
||||
|
||||
def test_detect_from_filename_no_match():
    """Check that filenames without a language marker yield ``None``."""
    detector = LanguageDetector()

    for filename in ("README.md", "guide.txt"):
        assert detector.detect_from_filename(filename) is None
|
||||
|
||||
|
||||
def test_add_document_single_language():
    """Check that one English document registers exactly one language."""
    manager = MultiLanguageManager()
    manager.add_document(
        "README.md",
        "This is an English document.",
        {"category": "overview"},
    )

    languages = manager.get_languages()
    assert len(languages) == 1
    assert "en" in languages
    assert manager.get_document_count("en") == 1
|
||||
|
||||
|
||||
def test_add_document_multiple_languages():
    """Check that documents in three languages are tracked separately."""
    manager = MultiLanguageManager()
    samples = (
        ("README.md", "This is English."),
        ("README.es.md", "Esto es español."),
        ("README.fr.md", "Ceci est français."),
    )
    for filename, content in samples:
        manager.add_document(filename, content, {})

    languages = manager.get_languages()
    assert len(languages) == 3
    for code in ("en", "es", "fr"):
        assert code in languages
|
||||
|
||||
|
||||
def test_force_language():
    """Check that ``force_language`` overrides what the content suggests."""
    manager = MultiLanguageManager()

    # The content is English, but the caller pins the document to Spanish.
    english_text = "This is actually English content."
    manager.add_document("file.md", english_text, {}, force_language="es")

    assert "es" in manager.get_languages()
    assert manager.get_document_count("es") == 1
|
||||
|
||||
|
||||
def test_filename_language_priority():
    """Check that a filename language marker wins over content detection."""
    manager = MultiLanguageManager()

    # The ".es" marker and the English content disagree on purpose.
    manager.add_document("guide.es.md", "This is English content.", {})

    # The document is expected under the filename's language.
    assert "es" in manager.get_languages()
|
||||
|
||||
|
||||
def test_document_count_all():
    """Check the overall document count and the per-language counts."""
    manager = MultiLanguageManager()
    samples = (
        ("file1.md", "English doc 1"),
        ("file2.md", "English doc 2"),
        ("file3.es.md", "Spanish doc"),
    )
    for filename, content in samples:
        manager.add_document(filename, content, {})

    # Without an argument the count covers every language.
    assert manager.get_document_count() == 3
    assert manager.get_document_count("en") == 2
    assert manager.get_document_count("es") == 1
|
||||
|
||||
|
||||
def test_primary_language():
    """Check that the primary language is that of the first document added."""
    manager = MultiLanguageManager()
    samples = (
        ("file1.md", "First English doc"),
        ("file2.es.md", "Spanish doc"),
    )
    for filename, content in samples:
        manager.add_document(filename, content, {})

    # The English document went in first, so "en" is expected.
    assert manager.primary_language == "en"
|
||||
|
||||
|
||||
def test_translation_status():
    """Check the source language and translated languages in the status."""
    manager = MultiLanguageManager()
    samples = (
        ("README.md", "English doc"),
        ("README.es.md", "Spanish doc"),
        ("README.fr.md", "French doc"),
    )
    for filename, content in samples:
        manager.add_document(filename, content, {})

    status = manager.get_translation_status()

    assert status.source_language == "en"
    translated = status.translated_languages
    for code in ("es", "fr"):
        assert code in translated
    assert len(translated) == 2
|
||||
|
||||
|
||||
def test_export_by_language():
    """Check that export produces one readable JSON file per language."""
    manager = MultiLanguageManager()
    samples = (
        ("file1.md", "English content"),
        ("file2.es.md", "Spanish content"),
    )
    for filename, content in samples:
        manager.add_document(filename, content, {})

    # All checks stay inside the ``with`` block: the exported files live in
    # the temporary directory, which is deleted on exit.
    with tempfile.TemporaryDirectory() as tmpdir:
        exports = manager.export_by_language(Path(tmpdir))

        assert len(exports) == 2
        for code in ("en", "es"):
            assert code in exports

        # Each entry must point at a file that exists.
        for code in ("en", "es"):
            assert exports[code].exists()

        # The English export is JSON carrying its language and document count.
        english_payload = json.loads(exports["en"].read_text())
        assert english_payload["language"] == "en"
        assert english_payload["document_count"] == 1
|
||||
|
||||
|
||||
def test_translation_report_generation():
    """Check that the translation report contains the expected fragments."""
    manager = MultiLanguageManager()
    samples = (
        ("file1.md", "English doc"),
        ("file2.es.md", "Spanish doc"),
    )
    for filename, content in samples:
        manager.add_document(filename, content, {})

    report = manager.generate_translation_report()

    expected_fragments = (
        "MULTI-LANGUAGE DOCUMENTATION REPORT",
        "Languages: 2",
        "English (en)",
        "Spanish (es)",
    )
    for fragment in expected_fragments:
        assert fragment in report
|
||||
|
||||
|
||||
def test_empty_manager():
    """Check a fresh manager: no languages, no documents, no primary language."""
    manager = MultiLanguageManager()

    languages = manager.get_languages()
    assert len(languages) == 0
    assert manager.get_document_count() == 0
    assert manager.primary_language is None
|
||||
|
||||
|
||||
def test_script_detection():
    """Check the ``script`` reported for English and Chinese samples."""
    detector = LanguageDetector()

    # English text is expected to report the Latin script.
    english_result = detector.detect("This is English")
    assert english_result.script == "Latin"

    # Chinese text is expected to report the Han script.
    chinese_result = detector.detect("这是中文")
    assert chinese_result.script == "Han"
|
||||
|
||||
|
||||
def test_confidence_scoring():
    """Check that a clearly English sample scores above the 0.3 threshold."""
    # Two plain English sentences give the detector a strong signal.
    sample = "The quick brown fox jumps over the lazy dog. This is clearly English."
    result = LanguageDetector().detect(sample)

    assert result.code == "en"
    # 0.3 is the minimum this test accepts as decent confidence.
    assert result.confidence > 0.3
|
||||
|
||||
|
||||
def test_metadata_preservation():
    """Check that a document's metadata is stored unchanged."""
    manager = MultiLanguageManager()
    original_metadata = {"category": "guide", "version": "1.0"}

    manager.add_document("file.md", "English content", original_metadata)

    # Stored entries are read from ``manager.documents`` by language code.
    english_docs = manager.documents["en"]
    assert len(english_docs) == 1
    stored_entry = english_docs[0]
    assert stored_entry["metadata"] == original_metadata
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # pytest.main() returns the run's exit code rather than exiting itself.
    # Pass it to sys.exit() so that running this file directly ends with a
    # non-zero status when tests fail, instead of always exiting with 0.
    sys.exit(pytest.main([__file__, "-v"]))
|
||||
Reference in New Issue
Block a user